In [1]:
import copy
import pandas as pd
import numpy as np
from datetime import datetime

import nltk
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from libs.MongoConnect import MongoConnect
from libs.conf import *

# Open the project's Mongo wrapper against the "green" host configured in libs.conf.
# NOTE(review): 'visitor' / 'fb_adverts' look like the database and collection
# names -- confirm against MongoConnect's signature.
mongo_address = ':'.join((MONGO_GREEN_IP, str(MONGO_GREEN_PORT)))
collector = MongoConnect(mongo_address, 'visitor', 'fb_adverts')

In [2]:
# Fetch every document in the collection (empty filter) and materialise the
# cursor into a list so pandas can build one row per document.
cursor = collector.collection.find({})
ds = list(cursor)
df = pd.DataFrame(ds)

#### text processing

In [3]:
tokenizer = nltk.tokenize.WordPunctTokenizer()


def _normalize_text(raw):
    """Lower-case ``raw`` and re-join its WordPunct tokens with single spaces.

    ``raw`` is coerced with ``str()`` first, so a missing value is tokenised as
    its string form (e.g. ``'nan'``) rather than being skipped.
    """
    lowered = str(raw).lower()
    return ' '.join(tokenizer.tokenize(lowered))


# Keep the untouched text alongside the normalised version.
# NOTE(review): the column name 'origial_text' is misspelled, but it ends up in
# the saved file, so renaming it must be coordinated with downstream readers.
df['origial_text'] = df['text']
df['text'] = df['text'].apply(_normalize_text)

#### date processing

In [4]:
def add_date_parts(df, date_column='published'):
    """Add calendar-part columns derived from a datetime column, in place.

    The columns ``hour``, ``month``, ``weekday``, ``year``, ``week`` and
    ``month_day`` are written onto ``df`` (existing columns with those names
    are overwritten).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to mutate; ``df[date_column]`` must support the ``.dt`` accessor.
    date_column : str, default 'published'
        Name of the datetime column the parts are extracted from.

    Returns
    -------
    None
        ``df`` is modified in place.
    """
    dates = df[date_column].dt
    df['hour'] = dates.hour
    df['month'] = dates.month
    df['weekday'] = dates.weekday
    df['year'] = dates.year
    # `Series.dt.week` was deprecated in pandas 1.1 and removed in 2.0;
    # `isocalendar().week` is the supported way to get the ISO week number.
    if hasattr(dates, 'isocalendar'):
        iso_week = dates.isocalendar().week
        # isocalendar() yields a nullable UInt32 column; cast it back to the
        # dtype the old `.dt.week` produced (float64 when dates are missing,
        # int64 otherwise) so the saved schema does not change.
        week_dtype = 'float64' if iso_week.isna().any() else 'int64'
        df['week'] = iso_week.astype(week_dtype)
    else:
        # pandas < 1.1 has no isocalendar(); fall back to the legacy attribute.
        df['week'] = dates.week
    df['month_day'] = dates.day

In [5]:
# Convert the `time` column from epoch seconds to naive UTC datetimes in one
# vectorised call. This replaces a per-row datetime.utcfromtimestamp() ->
# strftime() -> pd.to_datetime() round trip: utcfromtimestamp is deprecated
# since Python 3.12 and raises on missing values, whereas to_datetime maps
# them to NaT.
# Flooring to whole seconds keeps the resolution the old '%Y-%m-%d %H:%M:%S'
# string format produced.
df['time'] = pd.to_datetime(df['time'], unit='s').dt.floor('s')
add_date_parts(df, 'time')

#### feature flags (photos, outer link, seller contact)

In [6]:
# Derive boolean flags from the raw advert fields.
photo_counts = df['photos'].str.len()
# A missing `photos` value gives NaN here, and `NaN != 0` evaluates to True,
# which would flag adverts with no photo data as having photos. Treat missing
# as zero photos instead.
df['was_photo'] = photo_counts.fillna(0) > 0
df['was_link'] = df['outer_link'].notna()
# Overwrites the raw label with a boolean: True only for the exact text
# 'Message Seller'.
df['write_to_seller'] = df['write_to_seller'] == 'Message Seller'

#### location processing

In [7]:
# Keep only the part before the first comma (drops the state suffix).
location_parts = df['location'].str.split(',')
df['location'] = location_parts.str.get(0)

In [8]:
# Blank out rows whose location is exactly 'Boston'.
# NOTE(review): presumably 'Boston' is the generic/default location and so
# carries no information -- confirm the intent.
is_boston = df['location'].eq('Boston')
df.loc[is_boston, 'location'] = None

#### saving

In [9]:
DATA_PATH = '../data/'

# Raw fields that are not written to the cleaned output file.
dropped_columns = ['link', 'author', 'outer_link', 'photos', 'time']
clean_df = df.drop(columns=dropped_columns)

# NOTE(review): if Mongo's `_id` column is still present and holds ObjectId
# values, feather serialisation may reject it -- confirm.
clean_df.to_feather(DATA_PATH + 'housing_clean')