# Sentimental analysis 
The competition from Kaggel "*Bag of Words Meets Bags of Popcorn*"

In [1]:
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.grid_search import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn import cross_validation

from sklearn.externals import joblib

In [2]:
# you can download data from the page of competition https://www.kaggle.com/c/word2vec-nlp-tutorial/data

imbd_train = pd.read_csv('labeledTrainData.tsv', delimiter='\t')
imbd_test = pd.read_csv('testData.tsv', delimiter='\t')

In [3]:
# Number of the train and test datasets

print(imbd_train.shape)
print(imbd_test.shape)

(25000, 3)
(25000, 2)


In [6]:
# The training dataset consists balanced number of the positive and negative reviews

print(imbd_train[imbd_train.sentiment == 1].shape)

(12500, 3)


# Create pipeline for finding the best parameters

In [7]:
pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(stop_words='english')),
        ('lr', LogisticRegression(n_jobs=-1))
    ])

In [10]:
parameters = {
    'lr__C': (0.01, 0.1, 10),
    'lr__penalty': ('l1', 'l2'),
    'tfidf__min_df': (0, 1, 3, 5),
    'tfidf__ngram_range': ((1, 1), (1, 2), (1, 3)),
    'tfidf__max_features': (None, 2000, 8000, 12000)
}

In [11]:
cv = cross_validation.StratifiedShuffleSplit(imbd_train.sentiment, test_size=0.2, random_state=42)

In [12]:
# Submissions are judged on area under the ROC curve. 

grid = RandomizedSearchCV(pipeline, parameters, scoring='roc_auc', cv=cv)

In [None]:
grid.fit(imbd_train.review, imbd_train.sentiment)

In [14]:
grid.best_score_

0.95387377600000012

In [15]:
grid.best_params_

{'lr__C': 10,
 'lr__penalty': 'l2',
 'tfidf__max_features': 12000,
 'tfidf__min_df': 0,
 'tfidf__ngram_range': (1, 2)}

# Create prediction

In [21]:
tfidf = TfidfVectorizer(ngram_range=(1, 2), stop_words='english', min_df=0)

In [22]:
# for fitting tfidf vectorizer will use reviews from both datasets
vect = tfidf.fit(imbd_train.review + imbd_test.review)

X_train = vect.transform(imbd_train.review)
X_test = vect.transform(imbd_test.review)

In [23]:
clf = LogisticRegression(C=10, penalty='l2')
clf.fit(X_train, imbd_train.sentiment)

LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [24]:
prediction = clf.predict_proba(X_test)[:,1]

In [25]:
response = pd.DataFrame(data={'id': imbd_test.id, 'sentiment': prediction})
response.to_csv('tfidf_lr.csv', index=False)

The score for the _testing_ dataset is **0.95404**

# Data analysis of the prediction model

In [26]:
def display_important_features(feature_names, target_names, weights, n_top=30):
    sorted_features_indices = weights[0].argsort()[::-1]
    
    print('The most important "features" (words) for the first class: \n')
    most_important = sorted_features_indices[:n_top]
    print(",\n".join("{0}: {1:.4f}".format(feature_names[j], weights[0, j]) for j in most_important))

    print('\nThe most unimportant "features" (words) for the first class: \n')
    least_important = sorted_features_indices[-n_top:]
    print(",\n".join("{0}: {1:.4f}".format(feature_names[j], weights[0, j]) for j in least_important))

In [28]:
display_important_features(tfidf.get_feature_names(), imbd_train.sentiment.unique(), clf.coef_)

The most important "features" (words) for the first class: 

great: 17.1706,
excellent: 14.1051,
best: 12.3115,
wonderful: 11.1220,
perfect: 10.7192,
amazing: 9.7274,
love: 9.6657,
favorite: 9.5664,
fun: 9.1872,
loved: 8.8379,
today: 8.5322,
enjoyed: 8.1083,
brilliant: 8.0669,
beautiful: 7.7421,
superb: 7.7398,
highly: 7.1639,
enjoy: 6.9930,
definitely: 6.9677,
fantastic: 6.8260,
enjoyable: 6.5264,
10 10: 6.5148,
job: 6.4521,
bit: 6.3647,
liked: 6.3429,
life: 6.2765,
especially: 6.2463,
entertaining: 6.1633,
heart: 6.0502,
funniest: 6.0379,
rare: 5.9032

The most unimportant "features" (words) for the first class: 

badly: -7.2644,
pointless: -7.3440,
avoid: -7.6105,
disappointing: -7.6699,
just: -7.6744,
save: -7.6953,
lame: -7.8409,
mess: -7.8716,
waste time: -7.9765,
fails: -8.0054,
ridiculous: -8.2217,
minutes: -8.3099,
disappointment: -8.3445,
annoying: -8.4698,
instead: -8.5852,
stupid: -8.6196,
supposed: -8.7243,
unfortunately: -8.9381,
script: -9.1088,
dull: -9.3347,
poorly: -9