In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, classification_report
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the dataset and give its columns descriptive names: the post text
# followed by its emotion label and its sentiment label.
# NOTE(review): assumes goemotions.json yields exactly three columns in this
# order — confirm against the file.
df = pd.read_json('goemotions.json')
df.columns = ['post', 'emotion', 'sentiment']

In [3]:
# Peek at the first rows to sanity-check the renamed columns.
df.head()

Unnamed: 0,post,emotion,sentiment
0,That game hurt.,sadness,negative
1,"You do right, if you don't care then fuck 'em!",neutral,neutral
2,Man I love reddit.,love,positive
3,"[NAME] was nowhere near them, he was by the Fa...",neutral,neutral
4,Right? Considering it’s such an important docu...,gratitude,positive


### 2.1. Process the dataset using feature_extraction.text.CountVectorizer to extract tokens/words and their frequencies.

In [4]:
# Tokenise every post and count token occurrences; X is the resulting
# document-term count matrix (one row per post, one column per token).
# NOTE(review): the vocabulary is fitted on the full dataset, before the
# train/test split, so tokens that only occur in test posts are still part of
# the feature space — confirm this is acceptable for the experiment.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df.post)

In [5]:
# X has one column per distinct token, so its width is the vocabulary size.
print(f'The number of tokens: {X.shape[1]}')

The number of tokens: 30449


### 2.2. Split the dataset into 80% for training and 20% for testing. 

In [None]:
# Hold out 20% of the rows for testing. Both label columns are split together
# so the emotion and sentiment targets stay aligned with the same posts.
# A fixed random_state makes the partition (and every score derived from it)
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, df[['emotion', 'sentiment']], test_size=0.2, random_state=42)

### 2.3. Train and test the following classifiers, for both the emotion and the sentiment classification, using word frequency as features.

In [None]:
# One entry per evaluated run: [run label, confusion matrix, classification report].
performance_data = []


def train_model(model, model_name):
    """Fit ``model`` once per target column and log its test-set metrics.

    For each of the 'emotion' and 'sentiment' columns the estimator is fitted
    on the notebook-level ``X_train`` / ``y_train``, asked to predict
    ``X_test``, and a ``[label, confusion matrix, classification report]``
    entry is appended to the notebook-level ``performance_data`` list.

    Args:
        model: estimator exposing ``fit`` and ``predict``. It is refitted in
            place for each target, so on return it is fitted on 'sentiment'.
        model_name: text used as the prefix of the logged run label.
    """
    for target in ('emotion', 'sentiment'):
        model.fit(X_train, y_train[target])
        predictions = model.predict(X_test)
        truth = y_test[target]
        matrix = confusion_matrix(truth, predictions)
        report = classification_report(truth, predictions)
        performance_data.append([f'{model_name} {target}', matrix, report])

In [None]:
def grid_search(model, model_name, params):
    """Grid-search ``model`` per target column and log the winner's test metrics.

    A single ``GridSearchCV`` (2-fold cross-validation, ``n_jobs=-1``) is
    fitted on the notebook-level ``X_train`` / ``y_train`` once for 'emotion'
    and once for 'sentiment'. After each fit, the best estimator predicts
    ``X_test`` and a ``[label, confusion matrix, classification report]``
    entry is appended to the notebook-level ``performance_data`` list. The
    label embeds the selected hyper-parameters as space-separated
    ``name_value`` pairs.

    Args:
        model: estimator handed to ``GridSearchCV``.
        model_name: text used as the prefix of the logged run label.
        params: parameter grid passed to ``GridSearchCV``.
    """
    searcher = GridSearchCV(model, params, cv=2, n_jobs=-1)
    for target in ('emotion', 'sentiment'):
        searcher.fit(X_train, y_train[target])
        # Render the winning settings, e.g. "alpha_0.5".
        chosen = ' '.join(f'{key}_{value}' for key, value in searcher.best_params_.items())
        predictions = searcher.best_estimator_.predict(X_test)
        truth = y_test[target]
        matrix = confusion_matrix(truth, predictions)
        report = classification_report(truth, predictions)
        performance_data.append([f'{model_name} {chosen} {target}', matrix, report])

#### 2.3.1. Base-MNB: a Multinomial Naive Bayes Classifier (naive_bayes.MultinomialNB) with the default parameters.

In [None]:
# Baseline Multinomial Naive Bayes with scikit-learn's default settings,
# evaluated on both the emotion and the sentiment targets.
base_mnb = MultinomialNB()
train_model(base_mnb, 'MultinomialNB')

#### 2.3.2. Base-DT: a Decision Tree (tree.DecisionTreeClassifier) with the default parameters

In [None]:
# Baseline decision tree with scikit-learn's default settings, evaluated on
# both the emotion and the sentiment targets.
# NOTE(review): no random_state is set, so the fitted tree may differ between
# runs — confirm that is acceptable.
base_dt = DecisionTreeClassifier()
train_model(base_dt, 'DecisionTreeClassifier')

#### 2.3.3. Base-MLP: a Multi-Layered Perceptron (neural_network.MLPClassifier) with the default parameters.

In [None]:
# Baseline multi-layer perceptron with scikit-learn's default settings,
# evaluated on both the emotion and the sentiment targets.
# NOTE(review): no random_state is set, so weight initialisation (and hence
# the scores) may differ between runs — confirm that is acceptable.
base_mlp = MLPClassifier()
train_model(base_mlp, 'MLPClassifier')

#### 2.3.4. Top-MNB: a better performing Multinomial Naive Bayes Classifier found using GridSearchCV

In [None]:
# Candidate additive-smoothing strengths explored by the grid search.
# NOTE(review): alpha=0 requests no smoothing at all — confirm it is an
# intended candidate.
mnb_params = {'alpha': [0.5, 0, 2]}
grid_search(base_mnb, 'MultinomialNB', mnb_params)

#### 2.3.5. Top-DT: a better performing Decision Tree found using GridSearchCV. 

In [None]:
# Grid of 2 split criteria x 2 depth limits x 3 minimum split sizes
# (12 candidate trees per target).
dt_params = {'criterion': ['gini', 'entropy'],
             'max_depth': [1, 3],
             'min_samples_split': [2, 3, 4]}
grid_search(base_dt, 'DecisionTreeClassifier', dt_params)

#### 2.3.6. Top-MLP: a better performing Multi-Layered Perceptron found using GridSearchCV.

In [None]:
# Grid of 4 activations x 2 architectures x 2 solvers (16 candidate networks
# per target). The architectures are two hidden layers of 30 and 50 units, or
# three hidden layers of 10 units each.
mlp_params = {'activation': ['identity', 'logistic', 'tanh', 'relu'],
              'hidden_layer_sizes': [(30, 50), (10, 10, 10)],
              'solver': ['adam', 'sgd']}
grid_search(base_mlp, 'MLPClassifier', mlp_params)

### 2.4. For each of the 6 classifiers above and each of the classification tasks (emotion or sentiment), produce and save the following information in a file called performance.

In [None]:
# Dump every logged run to performance.txt. Each record's fields (run label,
# confusion matrix, classification report) are stringified, joined with
# commas and terminated by a newline.
with open('performance.txt', 'w') as report_file:
    for record in performance_data:
        report_file.write(','.join(str(field) for field in record) + '\n')

### 2.5. Use tf-idf instead of word frequencies and redo all substeps of 2.3 above – you can use TfidfTransformer for this. Display the results of this experiment.

In [None]:
# Re-weight the raw token counts with tf-idf, then split the *tf-idf* matrix.
# Splitting the count matrix X here would silently rerun the word-frequency
# experiment instead. The split variables are rebound because train_model and
# grid_search read X_train / X_test / y_train / y_test from the notebook
# namespace. A fixed random_state keeps the partition reproducible.
transformer = TfidfTransformer()
X_tfidf = transformer.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, df[['emotion', 'sentiment']], test_size=0.2, random_state=42)

In [None]:
# Rerun all six experiments against the current X_train / X_test split,
# starting from an empty results log.
performance_data = []

base_mnb = MultinomialNB()
base_dt = DecisionTreeClassifier()
base_mlp = MLPClassifier()

# Default-parameter baselines first ...
for estimator, title in ((base_mnb, 'MultinomialNB'),
                         (base_dt, 'DecisionTreeClassifier'),
                         (base_mlp, 'MLPClassifier')):
    train_model(estimator, title)

# ... then the grid-searched variants, in the same model order.
for estimator, title, grid in ((base_mnb, 'MultinomialNB', mnb_params),
                               (base_dt, 'DecisionTreeClassifier', dt_params),
                               (base_mlp, 'MLPClassifier', mlp_params)):
    grid_search(estimator, title, grid)

In [None]:
# Show the logged runs as a table. Each performance_data entry holds
# [run label, confusion matrix, classification report], in that order.
pd.DataFrame(performance_data, columns=['Model', 'confusion matrix', 'classification report'])

In [None]:
# Fetch the pretrained 'word2vec-google-news-300' embeddings through gensim's
# downloader API.
# NOTE(review): presumably a large download on first use — confirm before a
# full Restart & Run All.
from gensim.downloader import load
wordvec2model=load('word2vec-google-news-300')




In [None]:
from nltk