In [6]:
import pandas as pd
import numpy as np
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score,precision_score,recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

In [7]:
nlp = spacy.load('en_core_web_lg')

In [8]:
df = pd.read_csv('../Datasets/train/text_emotion.csv')

In [9]:
df.head()

Unnamed: 0.1,Unnamed: 0,text,emotions
0,0,i didnt feel humiliated,sadness
1,1,i can go from feeling so hopeless to so damned...,sadness
2,2,im grabbing a minute to post i feel greedy wrong,anger
3,3,i am ever feeling nostalgic about the fireplac...,love
4,4,i am feeling grouchy,anger


In [10]:
def print_results(results):
    print('BEST PARAMS: {}\n'.format(results.best_params_))

    means = results.cv_results_['mean_test_score']
    stds = results.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, results.cv_results_['params']):
        print('{} (+/-{}) for {}'.format(round(mean, 3), round(std * 2, 3), params))

# Seperate data into train and test split

### Extract document and Target labels from the dataset

In [11]:
document = df['text']
sentiment = df['emotions']

In [12]:
def space(comment):
    doc = nlp(comment)
    return " ".join([token.lemma_ for token in doc])
document = document.apply(space)

In [13]:
document = document.apply(lambda x: " ".join(x.lower() for x in x.split()))


In [14]:
document = document.str.replace('[^\w\s]','')


In [15]:
def stopword(text):
    doc=nlp(text)
    return " ".join([token.lemma_ for token in doc if not token.is_stop])

In [16]:
document = document.apply(stopword)

In [17]:
X_train, X_test, y_train, y_test = train_test_split(document, sentiment, test_size=0.20, random_state=42)

In [18]:
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

## Preprocessing

## Training Classifier

### Random Forrest

### Normal Technique

In [14]:
rf_normal = RandomForestClassifier()
rf_normal.fit(X_train, y_train)
y_pred = rf_normal.predict(X_test)

In [15]:
acc = round(accuracy_score(y_test,y_pred), 3)

In [16]:
print(acc)

0.484


In [17]:
rf = RandomForestClassifier()
scores = cross_val_score(rf,X_train,y_train,cv=5)

In [18]:
print(scores)
scores.mean()

[0.48997845 0.49525862 0.5        0.49073276 0.48469828]


0.49213362068965516

In [19]:
rf = RandomForestClassifier()
parameters = {
    'n_estimators': [5,50,100],
    'max_depth': [2,10,20,None]
}

cv = GridSearchCV(rf,parameters)
cv.fit(X_train,y_train)

GridSearchCV(estimator=RandomForestClassifier(),
             param_grid={'max_depth': [2, 10, 20, None],
                         'n_estimators': [5, 50, 100]})

In [20]:
print_results(cv)

BEST PARAMS: {'max_depth': None, 'n_estimators': 100}

0.183 (+/-0.004) for {'max_depth': 2, 'n_estimators': 5}
0.179 (+/-0.0) for {'max_depth': 2, 'n_estimators': 50}
0.179 (+/-0.0) for {'max_depth': 2, 'n_estimators': 100}
0.214 (+/-0.05) for {'max_depth': 10, 'n_estimators': 5}
0.183 (+/-0.004) for {'max_depth': 10, 'n_estimators': 50}
0.182 (+/-0.003) for {'max_depth': 10, 'n_estimators': 100}
0.242 (+/-0.061) for {'max_depth': 20, 'n_estimators': 5}
0.219 (+/-0.023) for {'max_depth': 20, 'n_estimators': 50}
0.213 (+/-0.017) for {'max_depth': 20, 'n_estimators': 100}
0.414 (+/-0.011) for {'max_depth': None, 'n_estimators': 5}
0.485 (+/-0.005) for {'max_depth': None, 'n_estimators': 50}
0.49 (+/-0.01) for {'max_depth': None, 'n_estimators': 100}


In [21]:
rf2 = RandomForestClassifier(n_estimators=100,max_depth=None)
rf2.fit(X_train, y_train)

RandomForestClassifier()

In [22]:
y_pred = rf2.predict(X_test)
accuracy = round(accuracy_score(y_test,y_pred), 3)

In [23]:
accuracy

0.484

### Naive Bayes

### Normal Technique

In [24]:
MNB = MultinomialNB()
MNB.fit(X_train, y_train)

MultinomialNB()

In [25]:
predicted = MNB.predict(X_test)
accuracy_score = accuracy_score(predicted, y_test)
print(str('{:04.2f}'.format(accuracy_score*100))+'%')

33.97%


In [26]:
MNB_cv = MultinomialNB()
scores = cross_val_score(MNB_cv,X_train,y_train,cv=5)
print(scores)
scores.mean()

[0.33211207 0.34148707 0.33502155 0.33146552 0.33286638]


0.33459051724137934

### Logistic Regression

In [19]:
lr = LogisticRegression()
grid = {"C":np.logspace(-3,3,7), "penalty":["l1", "l2"]}
grid_lr = GridSearchCV(lr, grid, cv=10)
grid_lr.fit(X_train, y_train)

Traceback (most recent call last):
  File "C:\Users\User\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\User\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1304, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\User\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 442, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "C:\Users\User\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\User\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1304, in fit
    solver = _check_solver(self.solver, self.penal

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Traceback (most recent call last):
  File "C:\Users\User\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\User\anaconda3\lib\site-packa

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

GridSearchCV(cv=10, estimator=LogisticRegression(),
             param_grid={'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]),
                         'penalty': ['l1', 'l2']})

In [20]:
grid_lr.best_params_

{'C': 1.0, 'penalty': 'l2'}

In [21]:
lr = LogisticRegression(C =1.0, penalty='l2')

In [22]:
lr.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [23]:
predicted = lr.predict(X_test)

In [24]:
print(accuracy_score(predicted, y_test))

0.4843103448275862


In [25]:
lr_cv = LogisticRegression()
scores = cross_val_score(lr_cv,X_train,y_train,cv=5)
print(scores)
scores.mean()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

[0.48911638 0.49795259 0.49903017 0.48674569 0.48577586]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.4917241379310345

### SVD

### Normal

In [26]:
svclassifier = SVC(kernel='sigmoid')

In [27]:
cross_val = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
accuracy_shopping = cross_val_score(svclassifier, X_train, y_train, cv=cross_val, scoring='accuracy')

accuracy_shopping.mean()

0.4960560344827586