#### To Do List
1. Everyone experiment with different models - see if we can get a high predicition score
2. Figure out what Rios means by reporting validation results using cross validation
3. Figure out how to write code to replace the current labels with the predictions from our model

## Libraries

In [3]:
import csv
import random
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score

## Data Preparation

#### Load the training and test sentiment datasets "test.tsv" and "train.tsv"

In [4]:
X_text_train = []
y_train = []

with open('train.tsv') as file:
    tsv_reader = csv.reader(file, delimiter = '\t', quoting = csv.QUOTE_NONE)
    
    for row in tsv_reader:
        X_text_train.append(row[1]) #text being processed
        y_train.append(row[2]) #label
        
X_text_test = []
y_test = []

with open('test.tsv') as file:
    tsv_reader = csv.reader(file, delimiter = '\t', quoting = csv.QUOTE_NONE)
    
    for row in tsv_reader:
        X_text_test.append(row[1]) #text being processed
        y_test.append(row[2]) #label

#### Convert X_text_train and X_text_test to matricies of numbers 

In [5]:
np.random.seed(42)
random.seed(42)

vec = CountVectorizer(ngram_range = (1,1))

X_train = vec.fit_transform(X_text_train) 
X_test = vec.transform(X_text_test) 

<br>

## (1) LinearSVC Classifier

#### Initialize the classifier LinearSVC, Create the params with the C values

In [6]:
svc = LinearSVC()

params = {"C": [0.0001, 0.001, 0.001, 0.01, 0.1, 1., 10., 100.]}

#### Initialize GridSearchCV and Fit the Model

In [7]:
clf = GridSearchCV(svc, params, cv = 20, scoring = 'f1_micro')
clf.fit(X_train, y_train)



GridSearchCV(cv=20, estimator=LinearSVC(),
             param_grid={'C': [0.0001, 0.001, 0.001, 0.01, 0.1, 1.0, 10.0,
                               100.0]},
             scoring='f1_micro')

#### Get the score from the GridSearchCV "best score" and Best Parameters

In [8]:
validation_score = clf.best_score_ 
print("Validation F1: {:.4f}".format(validation_score))

best_parameters = clf.best_params_
print(f"Best Params: {best_parameters}")

Validation F1: 0.7357
Best Params: {'C': 0.1}


#### "predict" on X_test 

In [9]:
svm_test_predictions = clf.predict(X_test) 

#### Get scores using svm_test_predictions and y_test with the precision_score method

In [11]:
precision = precision_score(y_test, svm_test_predictions, average = 'micro')
recall = recall_score(y_test, svm_test_predictions, average = 'micro')
f1 = f1_score(y_test, svm_test_predictions, average = 'micro')

print("Precision: {:.4f}".format(precision))
print("Recall: {:.4f}".format(recall))
print("F1: {:.4f}".format(f1))

Precision: 0.7753
Recall: 0.7753
F1: 0.7753


#### Store predictions and write to a new CSV

In [42]:
# Store predictions
svm_test_predictions_df = pd.DataFrame(data = svm_test_predictions)

# Write to a csv
with open('test.tsv') as file:
    test_file = csv.reader(file, delimiter = '\t', quoting = csv.QUOTE_NONE)
    test_df = pd.DataFrame(test_file)
    
test_df[2] = svm_test_predictions_df[0]
test_df.columns = ['TWITTER_ID', 'TEXT', 'PREDICTIONS']
test_df.to_csv('00. LinearSVC Prediction Results.csv', index = False)

<br>

## (2) RandomForestClassifier

#### Initialize the RandomForest classifier

In [36]:
rand_forest = RandomForestClassifier()
parameters = {'n_estimators': [10, 100, 200, 300, 400]}

#### Initialize GridSearchCV and Fit the Model

In [37]:
clf_rand = GridSearchCV(rand_forest, parameters, cv = 2)
clf_rand.fit(X_train, y_train)

GridSearchCV(cv=2, estimator=RandomForestClassifier(),
             param_grid={'n_estimators': [10, 100, 200, 300, 400]})

#### Get the score from the GridSearchCV "best score" and Best Parameters

In [38]:
validation_score = clf_rand.best_score_ 
print("Validation F1: {:.4f}".format(validation_score))

best_parameters = clf_rand.best_params_
print(f"Best Params: {best_parameters}")

Validation F1: 0.7317
Best Params: {'n_estimators': 100}


#### "predict" on X_test 

In [39]:
rand_test_predictions = clf_rand.predict(X_test) 

#### Get scores using rand_forest_predictions and y_test with the precision_score method

In [40]:
precision = precision_score(y_test, rand_test_predictions, average = 'micro')
recall = recall_score(y_test, rand_test_predictions, average = 'micro')
f1 = f1_score(y_test, rand_test_predictions, average = 'micro')

print("Precision: {:.4f}".format(precision))
print("Recall: {:.4f}".format(recall))
print("F1: {:.4f}".format(f1))

Precision: 0.8606
Recall: 0.8606
F1: 0.8606


#### Store predictions and write to CSV

In [43]:
# Store predictions
rand_test_predictions_df = pd.DataFrame(data = rand_test_predictions)

# Write to a csv
with open('test.tsv') as file:
    test_file = csv.reader(file, delimiter = '\t', quoting = csv.QUOTE_NONE)
    test_df = pd.DataFrame(test_file)
    
test_df[2] = rand_test_predictions_df[0]
test_df.columns = ['TWITTER_ID', 'TEXT', 'PREDICTIONS']
test_df.to_csv('00. RandomForest Prediction Results.csv', index = False)

<br>

## Classifier