In [1]:
import pandas as pd
import numpy as np
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score,precision_score,recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

In [2]:
nlp = spacy.load('en_core_web_lg')

In [3]:
df = pd.read_csv('../Datasets/train/text_emotion.csv')

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,text,emotions
0,0,i didnt feel humiliated,sadness
1,1,i can go from feeling so hopeless to so damned...,sadness
2,2,im grabbing a minute to post i feel greedy wrong,anger
3,3,i am ever feeling nostalgic about the fireplac...,love
4,4,i am feeling grouchy,anger


In [5]:
def print_results(results):
    print('BEST PARAMS: {}\n'.format(results.best_params_))

    means = results.cv_results_['mean_test_score']
    stds = results.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, results.cv_results_['params']):
        print('{} (+/-{}) for {}'.format(round(mean, 3), round(std * 2, 3), params))

# Seperate data into train and test split

### Extract document and Target labels from the dataset

In [6]:
document = df['text']
sentiment = df['emotions']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(document, sentiment, test_size=0.20, random_state=42)

In [8]:
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

## Preprocessing

## Training Classifier

### Random Forrest

### Normal Technique

In [None]:
rf_normal = RandomForestClassifier()
rf_normal.fit(X_train, y_train)
y_pred = rf_normal.predict(X_test)

In [None]:
acc = round(accuracy_score(y_test,y_pred), 3)

In [None]:
print(acc)

In [9]:
rf = RandomForestClassifier()
scores = cross_val_score(rf,X_train,y_train,cv=5)

In [10]:
print(scores)
scores.mean()

[0.48868534 0.49321121 0.49385776 0.49407328 0.47920259]


0.4898060344827586

In [11]:
rf = RandomForestClassifier()
parameters = {
    'n_estimators': [5,50,100],
    'max_depth': [2,10,20,None]
}

cv = GridSearchCV(rf,parameters)
cv.fit(X_train,y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              ra

In [12]:
print_results(cv)

BEST PARAMS: {'max_depth': None, 'n_estimators': 100}

0.185 (+/-0.005) for {'max_depth': 2, 'n_estimators': 5}
0.179 (+/-0.0) for {'max_depth': 2, 'n_estimators': 50}
0.179 (+/-0.0) for {'max_depth': 2, 'n_estimators': 100}
0.205 (+/-0.008) for {'max_depth': 10, 'n_estimators': 5}
0.184 (+/-0.003) for {'max_depth': 10, 'n_estimators': 50}
0.183 (+/-0.004) for {'max_depth': 10, 'n_estimators': 100}
0.234 (+/-0.011) for {'max_depth': 20, 'n_estimators': 5}
0.212 (+/-0.012) for {'max_depth': 20, 'n_estimators': 50}
0.212 (+/-0.007) for {'max_depth': 20, 'n_estimators': 100}
0.408 (+/-0.015) for {'max_depth': None, 'n_estimators': 5}
0.484 (+/-0.006) for {'max_depth': None, 'n_estimators': 50}
0.491 (+/-0.01) for {'max_depth': None, 'n_estimators': 100}


In [13]:
rf2 = RandomForestClassifier(n_estimators=100,max_depth=None)
rf2.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [14]:
y_pred = rf2.predict(X_test)
accuracy = round(accuracy_score(y_test,y_pred), 3)

In [15]:
accuracy

0.487

### Naive Bayes

### Normal Technique

In [16]:
MNB = MultinomialNB()
MNB.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [17]:
predicted = MNB.predict(X_test)
accuracy_score = accuracy_score(predicted, y_test)
print(str('{:04.2f}'.format(accuracy_score*100))+'%')

34.12%


In [18]:
MNB_cv = MultinomialNB()
scores = cross_val_score(MNB_cv,X_train,y_train,cv=5)
print(scores)
scores.mean()

[0.33297414 0.33782328 0.33286638 0.33232759 0.33329741]


0.33385775862068967

### Logistic Regression

In [19]:
lr = LogisticRegression()
grid = {"C":np.logspace(-3,3,7), "penalty":["l1", "l2"]}
grid_lr = GridSearchCV(lr, grid, cv=10)
grid_lr.fit(X_train, y_train)

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



KeyboardInterrupt: 

In [None]:
grid_lr.best_params_

In [None]:
lr = LogisticRegression(C =1.0, penalty='l2')

In [None]:
lr.fit(X_train, y_train)

In [None]:
predicted = lr.predict(X_test)

In [None]:
print(accuracy_score(predicted, y_test))

In [None]:
lr_cv = LogisticRegression()
scores = cross_val_score(lr_cv,X_train,y_train,cv=5)
print(scores)
scores.mean()

### SVD

### Normal

In [20]:
svclassifier = SVC(kernel='sigmoid')

In [21]:
cross_val = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
accuracy_shopping = cross_val_score(svclassifier, X_train, y_train, cv=cross_val, scoring='accuracy')

accuracy_shopping.mean()

0.5041163793103449