In [48]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string
from sklearn.externals import joblib
from imblearn.over_sampling import RandomOverSampler
import os
import matplotlib.pyplot as plt
%matplotlib inline

In [49]:
def preprocess_tweet(text):
    """Tokenize a tweet and return a lightly cleaned and a fully cleaned form.

    The text is tokenized with NLTK's ``TweetTokenizer`` (``preserve_case=False``,
    ``reduce_len=True``, ``strip_handles=False``) and every token containing the
    substring ``'http'`` is discarded.

    Args:
        text: The raw tweet text.

    Returns:
        A dict with two entries:
          * ``'semi_processed_text'``: the link-free tokens joined by single
            spaces (a string).
          * ``'processed_text'``: a list of the link-free tokens with English
            stop words, single punctuation characters (other than ``#`` and
            ``@``) and ``'...'`` removed. Tokens starting with ``#`` or ``@``
            are kept verbatim; all others are Porter-stemmed.
    """
    tweet_tokenizer = TweetTokenizer(
        preserve_case=False, reduce_len=True, strip_handles=False
    )
    link_free = [tok for tok in tweet_tokenizer.tokenize(text) if 'http' not in tok]

    # Stop list: English stop words, each punctuation character except the
    # hashtag/mention markers, plus the ellipsis token.
    ignored = set(stopwords.words('english'))
    ignored |= set(string.punctuation) - {'#', '@'}
    ignored.add('...')

    porter = PorterStemmer()
    cleaned = []
    for tok in link_free:
        if tok in ignored:
            continue
        if tok[0] in ('#', '@'):
            # Hashtags and mentions are kept as-is; stemming would mangle them.
            cleaned.append(tok)
        else:
            cleaned.append(porter.stem(tok))

    return {
        'semi_processed_text': " ".join(link_free),
        'processed_text': cleaned,
    }

def get_input_data(filename, directory, oversample=True):
    """Load labelled tweets from a CSV and convert them to a term-frequency matrix.

    The CSV is read with latin-1 encoding and must provide a ``text`` and a
    ``relevance`` column. Each text is run through ``preprocess_tweet`` and its
    ``'processed_text'`` tokens are joined with spaces. A ``CountVectorizer``
    and a ``TfidfTransformer(use_idf=False)`` are fitted on all rows and dumped
    with joblib into ``directory`` as ``count_vectorizer.plk`` and
    ``tf_transformer.plk``.

    Args:
        filename: Path of the CSV file to read.
        directory: Directory that receives the two fitted transformers.
        oversample: When true (the default, matching the previous behaviour)
            the classes are balanced with ``RandomOverSampler(random_state=42)``
            before returning. Pass ``False`` to get the un-resampled data.

    Returns:
        A ``(X, y)`` tuple: the term-frequency matrix and the labels, resampled
        when ``oversample`` is true.

    Raises:
        ValueError: If a row's text cannot be preprocessed. Previously the row
            counter was printed and ``None`` was returned, which only surfaced
            later as an unrelated unpacking error in the caller.

    NOTE(review): with ``oversample=True`` the returned data already contains
    the oversampler's resampled rows, so splitting it into train/test
    afterwards can put copies of the same tweet on both sides of the split.
    """
    df = pd.read_csv(filename, encoding="latin-1")
    labels = list(df["relevance"])

    documents = []
    for row, raw_text in enumerate(df["text"]):
        try:
            doc = preprocess_tweet(raw_text)
        except Exception:
            # Fail loudly and say which row is bad (e.g. an empty cell read as NaN).
            raise ValueError(
                "Could not preprocess row {} of {!r}: {!r}".format(row, filename, raw_text)
            )
        documents.append(" ".join(doc["processed_text"]))

    count_vect = CountVectorizer()
    X_counts = count_vect.fit_transform(documents)
    joblib.dump(count_vect, os.path.join(directory, "count_vectorizer.plk"))

    tf_transformer = TfidfTransformer(use_idf=False)
    X_tf = tf_transformer.fit_transform(X_counts)
    joblib.dump(tf_transformer, os.path.join(directory, "tf_transformer.plk"))

    if not oversample:
        return X_tf, labels

    ros = RandomOverSampler(random_state=42)
    # Newer imbalanced-learn releases expose `fit_resample`; fall back to the
    # older `fit_sample` name so the notebook runs on either version.
    resample = getattr(ros, "fit_resample", None) or ros.fit_sample
    X_res, y_res = resample(X_tf, labels)
    return X_res, y_res

In [50]:
def train_model(X_train, y_train, model_dir, model_name):
    """Fit the classifier selected by ``model_name`` and dump it with joblib.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        model_dir: Directory the fitted model is written to.
        model_name: File name of the model; it also selects the estimator.
            Supported values are ``"naive_bayes.plk"``,
            ``"logistic_regression.plk"`` and ``"random_forest.plk"``.

    Raises:
        ValueError: If ``model_name`` is not a supported name. Previously an
            unknown name trained and saved nothing, leaving the caller to load
            a stale or missing file.
    """
    # Factories are lambdas so only the requested estimator is constructed.
    factories = {
        "naive_bayes.plk": lambda: MultinomialNB(),
        "logistic_regression.plk": lambda: LogisticRegression(class_weight="balanced", C=1.0),
        # Seeded so repeated runs produce the same forest.
        "random_forest.plk": lambda: RandomForestClassifier(n_estimators=100, random_state=42),
    }
    if model_name not in factories:
        raise ValueError(
            "Unknown model_name {!r}; expected one of {}".format(model_name, sorted(factories))
        )

    clf = factories[model_name]()
    clf.fit(X_train, y_train)
    joblib.dump(clf, os.path.join(model_dir, model_name))

In [51]:
def main(filename, model_dir, model_name, train=False):
    """Evaluate (and optionally first train) a saved classifier on a 25% hold-out.

    Features and labels come from ``get_input_data``; they are split 75/25 with
    ``random_state=42``. When ``train`` is true, ``train_model`` is called on
    the training part. The model file ``model_dir/model_name`` is then loaded
    with joblib and the fraction of correct predictions on the test part is
    printed.

    Args:
        filename: Path of the labelled CSV.
        model_dir: Directory holding the model and transformer files.
        model_name: File name of the model to train and/or load.
        train: Whether to (re)train the model before evaluating it.

    NOTE(review): ``get_input_data`` applies ``RandomOverSampler`` before this
    function splits the data, so the test part may contain copies of training
    rows and the printed accuracy is likely optimistic -- confirm.
    """
    features, labels = get_input_data(filename, model_dir)
    split = train_test_split(features, labels, test_size=0.25, random_state=42)
    X_train, X_test, y_train, y_test = split

    if train:
        print("Training model.")
        train_model(X_train, y_train, model_dir, model_name)

    model_path = str(os.path.join(model_dir, model_name))
    classifier = joblib.load(model_path)

    hits = classifier.predict(X_test) == y_test
    print("Accuracy: ")
    print(np.mean(hits))

In [52]:
# Train a random forest on the BMW tweet CSV, save it under ../../models, and print its accuracy.
main(
    os.path.join("..", "..", "data_dir", "twitter_naive_bayes", "bmw_training.csv"),
    os.path.join("..", "..", "models"),
    "random_forest.plk",
    train=True,
)

  exec(code_obj, self.user_global_ns, self.user_ns)


Training model.
Accuracy: 
0.997041740323
