# Logistic Regression with CountVectorizer and TF-IDF features (unigrams and bigrams)

In [1]:
import pandas as pd
from wordcloud import WordCloud 
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
# Load the tweet dataset; the columns used later in this notebook are `text` and `target`.
df = pd.read_csv(filepath_or_buffer="../input_data/clean_tweet_without_NaN.csv")

In [3]:
# Features are the tweet text column, labels are the target column.
x, y = df["text"], df["target"]

In [4]:
# Fixed seed so both splits below are reproducible.
SEED = 2000

# Step 1: hold out 2% of the data; the remaining 98% is the training set.
x_train, x_validation_and_test, y_train, y_validation_and_test = train_test_split(
    x, y, test_size=.02, random_state=SEED
)

# Step 2: cut the hold-out in half to obtain the validation and test sets.
x_validation, x_test, y_validation, y_test = train_test_split(
    x_validation_and_test, y_validation_and_test, test_size=.5, random_state=SEED
)

In [5]:
# Report the size and class balance of every split. The message treats label 0
# as "negative" and label 4 as "positive". A single-argument print(...) call
# emits the same text on Python 2 and Python 3.
for split_name, split_x, split_y in (("Train", x_train, y_train),
                                     ("Validation", x_validation, y_validation),
                                     ("Test", x_test, y_test)):
    split_size = float(len(split_x))  # float keeps the ratios below in true division on Python 2
    print("{0} set has total {1} entries with {2:.2f}% negative, {3:.2f}% positive".format(
        split_name,
        len(split_x),
        (len(split_x[split_y == 0]) / split_size) * 100,
        (len(split_x[split_y == 4]) / split_size) * 100))

Train set has total 1553896 entries with 50.02% negative, 49.98% positive
Validation set has total 15856 entries with 50.26% negative, 49.74% positive
Test set has total 15857 entries with 49.90% negative, 50.10% positive


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from time import time
import numpy as np

In [7]:
def accuracy_summary(pipeline, x_train, y_train, x_test, y_test):
    """Fit ``pipeline``, score it on the evaluation data and print a short report.

    The accuracy is compared with the null accuracy: the share of the larger
    group when ``y_test`` is divided into "label 0" and "everything else".

    Parameters
    ----------
    pipeline : object
        Must expose ``fit(X, y)`` returning an object with ``predict(X)``.
    x_train, y_train
        Training samples and labels, handed to ``pipeline.fit`` unchanged.
    x_test, y_test
        Evaluation samples and labels. ``x_test`` must support boolean-mask
        indexing with ``y_test == 0``.

    Returns
    -------
    tuple
        ``(accuracy, train_test_time)`` where ``accuracy`` is the value from
        ``accuracy_score`` and ``train_test_time`` is the wall-clock time in
        seconds spent in ``fit`` plus ``predict``.
    """
    # Share of label-0 rows, computed once; float() forces true division on Python 2.
    negative_share = len(x_test[y_test == 0]) / float(len(x_test))
    # Always predicting the bigger group yields the larger of the two shares.
    null_accuracy = max(negative_share, 1. - negative_share)

    t0 = time()
    sentiment_fit = pipeline.fit(x_train, y_train)
    y_pred = sentiment_fit.predict(x_test)
    train_test_time = time() - t0

    accuracy = accuracy_score(y_test, y_pred)

    # Single-argument print(...) produces identical output on Python 2 and 3.
    print("null accuracy: {0:.2f}%".format(null_accuracy*100))
    print("accuracy score: {0:.2f}%".format(accuracy*100))
    if accuracy > null_accuracy:
        print("model is {0:.2f}% more accurate than null accuracy".format((accuracy-null_accuracy)*100))
    elif accuracy == null_accuracy:
        print("model has the same accuracy with the null accuracy")
    else:
        print("model is {0:.2f}% less accurate than null accuracy".format((null_accuracy-accuracy)*100))
    print("train and test time: {0:.2f}s".format(train_test_time))
    print("-"*80)
    return accuracy, train_test_time

In [8]:
# Shared estimator instances. `cvec` and `lr` are also the default vectorizer
# and classifier of nfeature_accuracy_checker defined in this cell.
cvec = CountVectorizer()
tvec = TfidfVectorizer()
# Pin the solver explicitly. The recorded run shows solver='warn' (the library
# default of that release), which resolves to liblinear but emits a
# FutureWarning and becomes a different solver in later scikit-learn versions.
lr = LogisticRegression(solver='liblinear')
# NOTE(review): GaussianNB appears to require dense input, while the text
# vectorizers produce sparse matrices - confirm before using `nb` as a classifier here.
nb = GaussianNB()
svm_obj = LinearSVC(C=0.1)

# Vocabulary sizes to sweep: 10,000 up to 100,000 in steps of 10,000.
n_features = np.arange(10000,100001,10000)

def nfeature_accuracy_checker(vectorizer=cvec, n_features=n_features, stop_words=None, ngram_range=(1, 1), classifier=lr):
    """Evaluate a vectorizer + classifier pipeline for each vocabulary size.

    For every ``n`` in ``n_features`` the vectorizer is reconfigured with
    ``max_features=n`` (plus the given ``stop_words`` and ``ngram_range``),
    wrapped with ``classifier`` in a ``Pipeline`` and passed to
    ``accuracy_summary``, which trains on the notebook-level
    ``x_train``/``y_train`` and scores on ``x_validation``/``y_validation``.

    Parameters
    ----------
    vectorizer : object exposing ``set_params``; defaults to the shared ``cvec``.
    n_features : iterable of vocabulary sizes; defaults to the module-level grid.
    stop_words : forwarded to ``vectorizer.set_params``; ``None`` by default.
    ngram_range : forwarded to ``vectorizer.set_params``; ``(1, 1)`` by default.
    classifier : final pipeline step; defaults to the shared ``lr``.

    Returns
    -------
    list of tuple
        One ``(n, accuracy, train_test_time)`` entry per value of ``n``.

    Notes
    -----
    The defaults are bound when this ``def`` executes, so rebinding ``cvec``,
    ``lr`` or ``n_features`` afterwards does not change them. The vectorizer
    passed in is reconfigured through ``set_params`` on every iteration rather
    than copied.
    """
    result = []
    # Single-argument print(...) yields the same output on Python 2 and 3.
    print(classifier)
    print("\n")
    for n in n_features:
        vectorizer.set_params(stop_words=stop_words, max_features=n, ngram_range=ngram_range)
        checker_pipeline = Pipeline([
            ('vectorizer', vectorizer),
            ('classifier', classifier)
        ])
        print("Validation result for {} features".format(n))
        nfeature_accuracy, tt_time = accuracy_summary(checker_pipeline, x_train, y_train, x_validation, y_validation)
        result.append((n, nfeature_accuracy, tt_time))
    return result

In [9]:
%%time
# TF-IDF sweep with the checker's default ngram_range=(1, 1) and
# stop_words=None, i.e. unigrams with stop words retained.
# print(...) with one argument emits the same text on Python 2 and 3.
print("RESULT FOR UNIGRAM WITH STOP WORDS (Tfidf)\n")
feature_result_ugt = nfeature_accuracy_checker(vectorizer=tvec)

RESULT FOR UNIGRAM WITH STOP WORDS (Tfidf)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)


Validation result for 10000 features




null accuracy: 50.26%
accuracy score: 77.03%
model is 26.77% more accurate than null accuracy
train and test time: 28.34s
--------------------------------------------------------------------------------
Validation result for 20000 features
null accuracy: 50.26%
accuracy score: 77.20%
model is 26.94% more accurate than null accuracy
train and test time: 32.70s
--------------------------------------------------------------------------------
Validation result for 30000 features
null accuracy: 50.26%
accuracy score: 77.18%
model is 26.92% more accurate than null accuracy
train and test time: 31.02s
--------------------------------------------------------------------------------
Validation result for 40000 features
null accuracy: 50.26%
accuracy score: 77.24%
model is 26.98% more accurate than null accuracy
train and test time: 34.98s
--------------------------------------------------------------------------------
Validation result for 50000 features
null accuracy: 50.26%
accuracy score: 77

In [10]:
%%time
# TF-IDF sweep with ngram_range=(1, 2), i.e. unigrams plus bigrams, and the
# checker's default stop_words=None (stop words retained).
# print(...) with one argument emits the same text on Python 2 and 3.
print("RESULT FOR BIGRAM WITH STOP WORDS (Tfidf)\n")
feature_result_bgt = nfeature_accuracy_checker(vectorizer=tvec, ngram_range=(1, 2))

RESULT FOR BIGRAM WITH STOP WORDS (Tfidf)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)


Validation result for 10000 features
null accuracy: 50.26%
accuracy score: 77.31%
model is 27.05% more accurate than null accuracy
train and test time: 67.98s
--------------------------------------------------------------------------------
Validation result for 20000 features
null accuracy: 50.26%
accuracy score: 77.57%
model is 27.31% more accurate than null accuracy
train and test time: 67.45s
--------------------------------------------------------------------------------
Validation result for 30000 features
null accuracy: 50.26%
accuracy score: 77.97%
model is 27.71% more accurate than null accuracy
train and test time: 73.04s
----------------------------------------------------