In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from collections import Counter

stop_words = stopwords.words('english')
lem = WordNetLemmatizer()

import nltk
from wordcloud import WordCloud
import matplotlib.pyplot as plt

In [2]:
def cleaning(data):
    
    stop_words = stopwords.words('english')
    lem = WordNetLemmatizer()
    
    #1. Tokenize
    #text_tokens = word_tokenize(data.lower())
    text_tokens = word_tokenize(data) # removed lower for tagging
    
    #2.Remove puncs
    text_tokens = [t for t in text_tokens if t.isalpha()]
    
    #3. stop words
    text_tokens = [t for t in text_tokens if not t in stop_words]
    
    #4. Lemma
    text_tokens = [lem.lemmatize(t) for t in text_tokens]
    
    #5 Join
    return " ".join(text_tokens)

In [3]:
df = pd.read_csv('train.tsv', sep='\t')
df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [4]:
df.Sentiment = df.Sentiment.apply(lambda x : "positive" if x in [3,4] else ('negative' if x in [0,1] else 2))
df.Sentiment.value_counts()

2           79582
positive    42133
negative    34345
Name: Sentiment, dtype: int64

In [5]:
df = df[df['Sentiment']!=2]
df.Sentiment.value_counts()

positive    42133
negative    34345
Name: Sentiment, dtype: int64

In [6]:
df = df[['Phrase','Sentiment']]
df.columns = ['sentences', 'sentiment']
df = df.reset_index(drop = True)
df.head()

Unnamed: 0,sentences,sentiment
0,A series of escapades demonstrating the adage ...,negative
1,good for the goose,positive
2,good,positive
3,"the gander , some of which occasionally amuses...",negative
4,amuses,positive


In [7]:
df.sentences[0]

'A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .'

In [8]:
df["sentences_2"] = df['sentences'].apply(cleaning) #Model
#df['sentences_3'] = df['sentences_2'].apply(lambda x : x.split()) #PoST
#df['sentences_4'] = df['sentences_3'].apply(lambda x : nltk.pos_tag(x)) #tag

# Vectorization

In [9]:
X = df["sentences_2"]
y = df["sentiment"]

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
vectorizer = CountVectorizer().fit(X)
X_count = vectorizer.transform(X)
count_df_X = pd.DataFrame( X_count.toarray(),columns = vectorizer.get_feature_names() )
count_df_X.head()

Unnamed: 0,aaa,aaliyah,aan,abagnale,abandon,abandoned,abbass,abbott,abbreviated,abc,...,ziyi,zoe,zombie,zone,zoning,zoolander,zoom,zucker,zwick,zzzzzzzzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# tf-idf

In [10]:
X = df["sentences_2"]
y = df["sentiment"]

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
tf_idf_vectorizer = TfidfVectorizer().fit(X)
X_tf_idf = tf_idf_vectorizer.transform(X)

tf_idf_df_X = pd.DataFrame(X_tf_idf.toarray(), columns = tf_idf_vectorizer.get_feature_names())
tf_idf_df_X.head()

Unnamed: 0,aaa,aaliyah,aan,abagnale,abandon,abandoned,abbass,abbott,abbreviated,abc,...,ziyi,zoe,zombie,zone,zoning,zoolander,zoom,zucker,zwick,zzzzzzzzz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# MODELS

In [16]:
X1 = count_df_X  # count vectorize df X
X2 = tf_idf_df_X # tfidf vectorize df X

from sklearn.preprocessing import LabelEncoder
yorj = y.copy()

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)
y

y = pd.DataFrame(y,columns = ['sentinent'])
y

In [31]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X1, y, test_size = 0.2, random_state = 42)

In [32]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
log_model=LogisticRegression().fit(X_train,y_train)
y_pred=log_model.predict(X_test)

from sklearn.metrics import confusion_matrix, classification_report
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

  return f(**kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[[5683 1102]
 [ 779 7732]]
              precision    recall  f1-score   support

           0       0.88      0.84      0.86      6785
           1       0.88      0.91      0.89      8511

    accuracy                           0.88     15296
   macro avg       0.88      0.87      0.87     15296
weighted avg       0.88      0.88      0.88     15296



In [13]:
# from xgboost import XGBClassifier
# xgb_model = XGBClassifier().fit(X_train, y_train)
# y_pred = xgb_model.predict(X_test)

In [None]:
# from sklearn.model_selection import GridSearchCV
# from xgboost import XGBClassifier
# xgb = XGBClassifier()
# xgb_params = {"n_estimators": [50, 100], "subsample":[0.5,1], "max_depth":[3,7], "learning_rate":[0.1,0.3]}
# xgb_cv_model = GridSearchCV(xgb, xgb_params, cv = 3, n_jobs = -1, verbose = 2).fit(X_train, y_train)

# print(xgb_cv_model.best_params_)

In [34]:
from sklearn.naive_bayes import MultinomialNB
model_naive = MultinomialNB().fit(X_train,y_train)
y_pred=model_naive.predict(X_test)

from sklearn.metrics import confusion_matrix, classification_report
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

  return f(**kwargs)


[[5560 1225]
 [1009 7502]]
              precision    recall  f1-score   support

           0       0.85      0.82      0.83      6785
           1       0.86      0.88      0.87      8511

    accuracy                           0.85     15296
   macro avg       0.85      0.85      0.85     15296
weighted avg       0.85      0.85      0.85     15296



In [36]:
from sklearn.naive_bayes import MultinomialNB
model_naive = MultinomialNB().fit(X_train,y_train.values.ravel())
y_pred=model_naive.predict(X_test)

from sklearn.metrics import confusion_matrix, classification_report
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[5560 1225]
 [1009 7502]]
              precision    recall  f1-score   support

           0       0.85      0.82      0.83      6785
           1       0.86      0.88      0.87      8511

    accuracy                           0.85     15296
   macro avg       0.85      0.85      0.85     15296
weighted avg       0.85      0.85      0.85     15296



In [50]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text  import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=cleaning)),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', MultinomialNB()),  # train on TF-IDF vectors w/ Naive Bayes classifier
])

In [57]:
Xorj = df["sentences_2"]
yorj = df["sentiment"]
from sklearn.model_selection import train_test_split
Xorj_train, Xorj_test, yorj_train, yorj_test = train_test_split(Xorj, yorj, test_size = 0.2, random_state = 42)

pipeline.fit(Xorj_train,yorj_train)
y_pred = pipeline.predict(Xorj_test)

from sklearn.metrics import confusion_matrix, classification_report
print(confusion_matrix(yorj_test, y_pred))
print(classification_report(yorj_test, y_pred))

[[3741 3044]
 [1655 6856]]
              precision    recall  f1-score   support

    negative       0.69      0.55      0.61      6785
    positive       0.69      0.81      0.74      8511

    accuracy                           0.69     15296
   macro avg       0.69      0.68      0.68     15296
weighted avg       0.69      0.69      0.69     15296



In [55]:
def cleaning(data):
    
    stop_words = stopwords.words('english')
    lem = WordNetLemmatizer()
    
    #1. Tokenize
    #text_tokens = word_tokenize(data.lower())
    text_tokens = word_tokenize(data) # removed lower for tagging
    
    #2.Remove puncs
    text_tokens = [t for t in text_tokens if t.isalpha()]
    
    #3. stop words
    text_tokens = [t for t in text_tokens if not t in stop_words]
    
    #4. Lemma
    text_tokens = [lem.lemmatize(t) for t in text_tokens]
    
    #5 Join
    return " ".join(text_tokens)

from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text  import TfidfTransformer
pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=cleaning)),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', RandomForestClassifier()),  # train on TF-IDF vectors w/ Naive Bayes classifier
])

Xorj = df["sentences_2"]
yorj = df["sentiment"]
from sklearn.model_selection import train_test_split
Xorj_train, Xorj_test, yorj_train, yorj_test = train_test_split(Xorj, yorj, test_size = 0.2, random_state = 42)

pipeline.fit(Xorj_train,yorj_train)
y_pred = pipeline.predict(Xorj_test)

from sklearn.metrics import confusion_matrix, classification_report
print(confusion_matrix(yorj_test, y_pred))
print(classification_report(yorj_test, y_pred))

[[4907 1878]
 [1042 7469]]
              precision    recall  f1-score   support

    negative       0.82      0.72      0.77      6785
    positive       0.80      0.88      0.84      8511

    accuracy                           0.81     15296
   macro avg       0.81      0.80      0.80     15296
weighted avg       0.81      0.81      0.81     15296



In [56]:
def cleaning(data):
    
    stop_words = stopwords.words('english')
    lem = WordNetLemmatizer()
    
    #1. Tokenize
    #text_tokens = word_tokenize(data.lower())
    text_tokens = word_tokenize(data) # removed lower for tagging
    
    #2.Remove puncs
    text_tokens = [t for t in text_tokens if t.isalpha()]
    
    #3. stop words
    text_tokens = [t for t in text_tokens if not t in stop_words]
    
    #4. Lemma
    text_tokens = [lem.lemmatize(t) for t in text_tokens]
    
    #5 Join
    return " ".join(text_tokens)

from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.feature_extraction.text  import TfidfTransformer
pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=cleaning)),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', XGBClassifier()),  # train on TF-IDF vectors w/ Naive Bayes classifier
])

Xorj = df["sentences_2"]
yorj = df["sentiment"]
from sklearn.model_selection import train_test_split
Xorj_train, Xorj_test, yorj_train, yorj_test = train_test_split(Xorj, yorj, test_size = 0.2, random_state = 42)

pipeline.fit(Xorj_train,yorj_train)
y_pred = pipeline.predict(Xorj_test)

from sklearn.metrics import confusion_matrix, classification_report
print(confusion_matrix(yorj_test, y_pred))
print(classification_report(yorj_test, y_pred))

[[3741 3044]
 [1655 6856]]
              precision    recall  f1-score   support

    negative       0.69      0.55      0.61      6785
    positive       0.69      0.81      0.74      8511

    accuracy                           0.69     15296
   macro avg       0.69      0.68      0.68     15296
weighted avg       0.69      0.69      0.69     15296



In [58]:
def cleaning(data):
    
    stop_words = stopwords.words('english')
    lem = WordNetLemmatizer()
    
    #1. Tokenize
    #text_tokens = word_tokenize(data.lower())
    text_tokens = word_tokenize(data) # removed lower for tagging
    
    #2.Remove puncs
    text_tokens = [t for t in text_tokens if t.isalpha()]
    
    #3. stop words
    text_tokens = [t for t in text_tokens if not t in stop_words]
    
    #4. Lemma
    text_tokens = [lem.lemmatize(t) for t in text_tokens]
    
    #5 Join
    return " ".join(text_tokens)

from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text  import TfidfTransformer
pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=cleaning)),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', LogisticRegression()),  # train on TF-IDF vectors w/ Naive Bayes classifier
])

Xorj = df["sentences_2"]
yorj = df["sentiment"]
from sklearn.model_selection import train_test_split
Xorj_train, Xorj_test, yorj_train, yorj_test = train_test_split(Xorj, yorj, test_size = 0.2, random_state = 42)

pipeline.fit(Xorj_train,yorj_train)
y_pred = pipeline.predict(Xorj_test)

from sklearn.metrics import confusion_matrix, classification_report
print(confusion_matrix(yorj_test, y_pred))
print(classification_report(yorj_test, y_pred))

[[2442 4343]
 [1848 6663]]
              precision    recall  f1-score   support

    negative       0.57      0.36      0.44      6785
    positive       0.61      0.78      0.68      8511

    accuracy                           0.60     15296
   macro avg       0.59      0.57      0.56     15296
weighted avg       0.59      0.60      0.58     15296



In [59]:
def cleaning(data):
    
    stop_words = stopwords.words('english')
    lem = WordNetLemmatizer()
    
    #1. Tokenize
    #text_tokens = word_tokenize(data.lower())
    text_tokens = word_tokenize(data) # removed lower for tagging
    
    #2.Remove puncs
    text_tokens = [t for t in text_tokens if t.isalpha()]
    
    #3. stop words
    text_tokens = [t for t in text_tokens if not t in stop_words]
    
    #4. Lemma
    text_tokens = [lem.lemmatize(t) for t in text_tokens]
    
    #5 Join
    return " ".join(text_tokens)

from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text  import TfidfTransformer,TfidfVectorizer, CountVectorizer

pipeline = Pipeline([
    ('bow', TfidfVectorizer(analyzer=cleaning)),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', LogisticRegression()),  # train on TF-IDF vectors w/ Naive Bayes classifier
])

X = df["sentences_2"]
y = df["sentiment"]
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

pipeline.fit(X_train,y_train)
y_pred = pipeline.predict(X_test)

from sklearn.metrics import confusion_matrix, classification_report
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[2391 4394]
 [1808 6703]]
              precision    recall  f1-score   support

    negative       0.57      0.35      0.44      6785
    positive       0.60      0.79      0.68      8511

    accuracy                           0.59     15296
   macro avg       0.59      0.57      0.56     15296
weighted avg       0.59      0.59      0.57     15296



In [60]:
X

0        A series escapade demonstrating adage good goo...
1                                               good goose
2                                                     good
3        gander occasionally amuses none amount much story
4                                                   amuses
                               ...                        
76473    quietly suggesting sadness obsession beneath H...
76474                                    sadness obsession
76475                                              sadness
76476                             forced avuncular chortle
76477                                    avuncular chortle
Name: sentences_2, Length: 76478, dtype: object

In [61]:
y

0        negative
1        positive
2        positive
3        negative
4        positive
           ...   
76473    negative
76474    negative
76475    negative
76476    negative
76477    positive
Name: sentiment, Length: 76478, dtype: object