## Sentiment Analysis

In [1]:
#!pip install git+https://github.com/laxmimerit/preprocess_kgptalkie.git --upgrade --force-reinstall
#!pip install xgboost


In [2]:
import pandas as pd
import numpy as np
import preprocess_kgptalkie as ps
import warnings 
warnings.filterwarnings(action = "ignore")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [4]:
# List every name exposed by the preprocess_kgptalkie module (imported as `ps`)
# to see which text-cleaning helpers are available.
print(dir(ps))

['BeautifulSoup', 'NaiveBayesAnalyzer', 'TextBlob', 'Translator', 'Word', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__path__', '__spec__', 'avg_word_len', 'char_count', 'clean_text', 'contraction_to_expansion', 'contractions', 'correct_spelling', 'count_emails', 'count_rt', 'count_urls', 'detect_language', 'download_nltk_data', 'extract_features', 'fpath', 'get_noun_phrase', 'hashtags_count', 'json', 'lemmatize', 'lemmatize_noun_verb', 'mentions_count', 'n_gram', 'nlp', 'nltk', 'numerics_count', 'os', 'pluralize_words', 're', 'remove_accented_chars', 'remove_common_words', 'remove_emails', 'remove_html_tags', 'remove_mentions', 'remove_rare_words', 'remove_repeated_chars', 'remove_rt', 'remove_special_chars', 'remove_stop_words', 'remove_urls', 'sentiment_analysis', 'singularize_words', 'spacy', 'stop_words_count', 'stopwords', 'text_preprocess', 'to_lower_case', 'translate', 'unicodedata', 'upper_case_count', 'word_count']


In [5]:
# Load the tab-separated review file; it has no header row, so pandas assigns
# integer column labels.
df = pd.read_csv(
    'data/imdb_reviews.txt',
    header=None,
    sep='\t',
)

In [6]:
# Replace the default integer column labels with descriptive names:
# first column is the review text, second is the sentiment label.
df.columns = ['reviews', 'sentiment']

In [7]:
# Preview the first rows of the raw data before any cleaning is applied.
df.head()

Unnamed: 0,reviews,sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [8]:
# Scratch cell: try a single preprocess_kgptalkie helper on one sample sentence.
x = 'A very, very, very slow-moving, aimless movie'
# Other helpers that can be tried on the same sample by uncommenting a line:
#ps.remove_accented_chars(x)
#ps.remove_emails(x)
#ps.remove_html_tags(x)
#ps.remove_special_chars(x)
#ps.remove_urls(x)
#ps.contraction_to_expansion(x)
# The last expression is displayed as the cell output.
ps.correct_spelling(x)


'A very very very slow-moving, aimless movie'

In [9]:
def _clean_review(text):
    """Normalise one raw review into lowercase plain text.

    The helpers are the same ones used before; only their order changed.
    remove_special_chars strips punctuation, so it has to run AFTER the
    helpers that presumably rely on punctuation ('@', '<...>', '://') to
    recognise emails, HTML tags and URLs. Running it earlier leaves those
    helpers with nothing to match.
    """
    text = ps.contraction_to_expansion(text)  # needs the apostrophes intact
    text = ps.remove_accented_chars(text)
    text = ps.remove_emails(text)
    text = ps.remove_html_tags(text)
    text = ps.remove_urls(text)
    text = ps.remove_special_chars(text)      # punctuation goes last
    return str(text).lower()


# Single pass over the column instead of one .apply per helper.
df['reviews'] = df['reviews'].apply(_clean_review)

In [10]:
# Display the cleaned frame (pandas truncates the middle rows in the rendering).
df

Unnamed: 0,reviews,sentiment
0,a very very very slowmoving aimless movie abou...,0
1,not sure who was more lost the flat character...,0
2,attempting artiness with black white and clev...,0
3,very little music or anything to speak of,0
4,the best scene in the movie was when gerardo i...,1
...,...,...
743,i just got bored watching jessice lange take h...,0
744,unfortunately any virtue in this films product...,0
745,in a word it is embarrassing,0
746,exceptionally bad,0


In [11]:
# Feature series (cleaned review text) and target series (sentiment label).
X, y = df['reviews'], df['sentiment']

In [12]:
# Hold out 20% of the reviews for testing; stratify on the label so both
# splits keep the same class balance, and fix the seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=0,
)

In [13]:
# Sanity-check the sizes of the train and test splits.
(X_train.shape, X_test.shape, y_test.shape)

((598,), (150,), (150,))

## ML Model Building

In [14]:
# TF-IDF vectoriser followed by a liblinear logistic regression.
# NOTE(review): `pipe` is rebound to an XGBoost pipeline further down before
# any search is fitted, so this definition is never trained as written.
pipe = Pipeline(steps=[
    ("tfidf", TfidfVectorizer()),
    ("clf", LogisticRegression(solver="liblinear")),
])

In [15]:
# Grid for the TF-IDF + logistic-regression pipeline. Keys follow the
# `<step>__<parameter>` convention used by scikit-learn pipelines.
# NOTE(review): `hyperparameters` is reassigned in a later cell before any
# search is fitted with this grid.
hyperparameters = dict(
    tfidf__max_df=(0.5, 1.0),
    tfidf__ngram_range=((1, 1), (1, 2)),
    tfidf__use_idf=(True, False),
    tfidf__analyzer=('word', 'char', 'char_wb'),
    clf__penalty=('l2', 'l1'),
    clf__C=(1, 2),
)

In [16]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier

# TF-IDF features feeding a gradient-boosted tree classifier.
# - eval_metric is 'logloss' (binary log loss): the target has two classes,
#   whereas 'mlogloss' is the multi-class variant.
# - `use_label_encoder` is no longer passed: it is deprecated in xgboost and
#   ignored by current releases.
# - random_state is pinned, matching the seeded split and random forest.
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', XGBClassifier(
        eval_metric='logloss',
        random_state=0,
    ))
])


In [17]:
# Grid for the TF-IDF + XGBoost pipeline: two vectoriser settings and four
# booster settings, addressed as `<step>__<parameter>`.
hyperparameters = dict(
    tfidf__max_df=[0.5, 1.0],
    tfidf__ngram_range=[(1, 1), (1, 2)],
    clf__n_estimators=[100, 200],
    clf__max_depth=[3, 5, 7],
    clf__learning_rate=[0.1, 0.3],
    clf__subsample=[0.8, 1.0],
)


In [18]:
# Exhaustive search over the grid using all CPU cores; cv=None falls back to
# scikit-learn's default cross-validation splitter.
clf = GridSearchCV(
    estimator=pipe,
    param_grid=hyperparameters,
    cv=None,
    n_jobs=-1,
)

In [19]:
%%time
# Run the grid search on the training split; %%time reports how long it takes.
clf.fit(X_train, y_train)

CPU times: total: 5.05 s
Wall time: 1min 25s


0,1,2
,estimator,"Pipeline(step...=None, ...))])"
,param_grid,"{'clf__learning_rate': [0.1, 0.3], 'clf__max_depth': [3, 5, ...], 'clf__n_estimators': [100, 200], 'clf__subsample': [0.8, 1.0], ...}"
,scoring,
,n_jobs,-1
,refit,True
,cv,
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [20]:
# Pipeline refitted with the best parameter combination found by the search.
clf.best_estimator_

0,1,2
,steps,"[('tfidf', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [21]:
# Parameter combination that achieved the best cross-validated score.
clf.best_params_

{'clf__learning_rate': 0.1,
 'clf__max_depth': 3,
 'clf__n_estimators': 200,
 'clf__subsample': 1.0,
 'tfidf__max_df': 1.0,
 'tfidf__ngram_range': (1, 1)}

In [22]:
# Mean cross-validated score of the best parameter combination.
clf.best_score_

np.float64(0.6421008403361343)

In [23]:
# Predict labels for the held-out test reviews with the refitted best pipeline.
y_pred = clf.predict(X_test)

In [24]:
# Per-class precision/recall/F1 on the test split (print keeps the text table aligned).
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.68      0.63      0.65        73
           1       0.67      0.71      0.69        77

    accuracy                           0.67       150
   macro avg       0.67      0.67      0.67       150
weighted avg       0.67      0.67      0.67       150



### Using Random Forest


In [25]:
from sklearn.ensemble import RandomForestClassifier


In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

# TF-IDF features feeding a seeded random forest; kept under its own name
# (`pipe2`) so it does not overwrite the earlier pipeline.
pipe2 = Pipeline(steps=[
    ("tfidf", TfidfVectorizer()),
    ("clf", RandomForestClassifier(random_state=0)),
])


In [27]:
# Grid for the TF-IDF + random-forest pipeline (`<step>__<parameter>` keys).
hyperparameters = dict(
    tfidf__max_df=[0.5, 1.0],
    tfidf__ngram_range=[(1, 1), (1, 2)],
    clf__n_estimators=[100, 200, 500],     # trees in the forest
    clf__max_depth=[None, 10, 20, 50],     # maximum depth per tree
    clf__min_samples_split=[2, 5, 10],     # samples needed to split a node
    clf__min_samples_leaf=[1, 2, 4],       # samples required at a leaf
)


In [28]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV

# 3-fold grid search over the random-forest pipeline on all CPU cores;
# verbose=1 prints the number of fits being run.
clf = GridSearchCV(
    estimator=pipe2,
    param_grid=hyperparameters,
    cv=3,
    n_jobs=-1,
    verbose=1,
)
clf.fit(X_train, y_train)

# Summary of the search.
print("Best Parameters:", clf.best_params_)
print("Best CV Score:", clf.best_score_)

# Evaluate the refitted best pipeline on the held-out split.
y_pred = clf.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Fitting 3 folds for each of 432 candidates, totalling 1296 fits
Best Parameters: {'clf__max_depth': 20, 'clf__min_samples_leaf': 1, 'clf__min_samples_split': 5, 'clf__n_estimators': 500, 'tfidf__max_df': 0.5, 'tfidf__ngram_range': (1, 1)}
Best CV Score: 0.6873283082077052
Test Accuracy: 0.7133333333333334
              precision    recall  f1-score   support

           0       0.74      0.63      0.68        73
           1       0.69      0.79      0.74        77

    accuracy                           0.71       150
   macro avg       0.72      0.71      0.71       150
weighted avg       0.72      0.71      0.71       150



### Using SVM

In [29]:
from sklearn.svm import LinearSVC

In [30]:
# TF-IDF features feeding a linear support-vector classifier. This rebinds
# `pipe`, replacing the pipeline defined earlier in the notebook.
# random_state is pinned because LinearSVC's dual solver shuffles the data;
# without a seed, repeated runs can produce slightly different models. The
# value matches the seed used for the split and the random forest.
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC(random_state=0))
])

In [31]:
# Grid for the TF-IDF + LinearSVC pipeline (`<step>__<parameter>` keys).
hyperparameters = dict(
    tfidf__max_df=(0.5, 1.0),
    tfidf__ngram_range=((1, 1), (1, 2)),
    tfidf__use_idf=(True, False),
    tfidf__analyzer=('word', 'char', 'char_wb'),
    clf__C=(1, 2, 2.5, 3),
)

# 5-fold grid search over the LinearSVC pipeline on all CPU cores.
clf = GridSearchCV(
    estimator=pipe,
    param_grid=hyperparameters,
    cv=5,
    n_jobs=-1,
)

In [32]:
%%time
# Run the grid search on the training split; %%time reports how long it takes.
clf.fit(X_train, y_train)

CPU times: total: 703 ms
Wall time: 3.66 s


0,1,2
,estimator,Pipeline(step...LinearSVC())])
,param_grid,"{'clf__C': (1, ...), 'tfidf__analyzer': ('word', ...), 'tfidf__max_df': (0.5, ...), 'tfidf__ngram_range': ((1, ...), ...), ...}"
,scoring,
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,penalty,'l2'
,loss,'squared_hinge'
,dual,'auto'
,tol,0.0001
,C,2
,multi_class,'ovr'
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,verbose,0


In [33]:
# Pipeline refitted with the best parameter combination found by the search.
clf.best_estimator_

0,1,2
,steps,"[('tfidf', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,penalty,'l2'
,loss,'squared_hinge'
,dual,'auto'
,tol,0.0001
,C,2
,multi_class,'ovr'
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,verbose,0


In [34]:
# Mean cross-validated score of the best parameter combination.
clf.best_score_

np.float64(0.7674789915966387)

In [36]:
# Predict the held-out reviews with the refitted best pipeline and print
# per-class precision/recall/F1.
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.77      0.75      0.76        73
           1       0.77      0.79      0.78        77

    accuracy                           0.77       150
   macro avg       0.77      0.77      0.77       150
weighted avg       0.77      0.77      0.77       150



In [37]:
# Smoke test: classify two hand-written reviews with the tuned model.
# The raw strings go straight into the pipeline, which vectorises them itself.
x = [
    'this is great moview. i loved it',
    'i have watched this movie. plot is straight. return my money',
]
clf.predict(x)

array([1, 0])

In [None]:
import pickle as pkl

# Persist the fitted search object (`clf`) to model.pkl in the working directory.
# The `with` block guarantees the file handle is flushed and closed even if
# pickling fails; the previous bare open() left the handle unclosed.
# NOTE: only unpickle this file from a trusted location - loading a pickle
# can execute arbitrary code.
with open('model.pkl', 'wb') as model_file:
    pkl.dump(clf, model_file)