## Random Forest Model

In [1]:
import numpy as np
import pandas as pd
from termcolor import colored
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load data
train_data = pd.read_csv('clean_train.csv')
test_data = pd.read_csv('clean_test.csv')
print(colored("Data loaded", "yellow"))

[33mData loaded[0m


In [3]:
# Tf-IDF
print(colored("Applying TF-IDF transformation", "yellow"))
tfidfVectorizer = TfidfVectorizer(min_df = 5, max_features = 1000)
tfidfVectorizer.fit(train_data['Clean_tweet'].apply(lambda x: np.str_(x)))

train_tweet_vector = tfidfVectorizer.transform(train_data['Clean_tweet'].apply(lambda x: np.str_(x)))
test_tweet_vector = tfidfVectorizer.transform(test_data['Clean_tweet'].apply(lambda x: np.str_(x)))

[33mApplying TF-IDF transformation[0m


### Training data

In [4]:
print(colored("Training Random Forest Classifier", "yellow"))
randomForestClassifier = RandomForestClassifier()
randomForestClassifier.fit(train_tweet_vector, train_data['Sentiment'])

[33mTraining Random Forest Classifier[0m




RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

### Prediction

In [5]:
print(colored("Predicting on train data", "yellow"))
prediction = randomForestClassifier.predict(train_tweet_vector)
print(colored("Training accuracy: {}%".format(accuracy_score(train_data['Sentiment'], prediction)*100), "green"))

[33mPredicting on train data[0m
[32mTraining accuracy: 94.45476562500001%[0m


In [6]:
print(colored("Predicting on test data", "yellow"))
prediction = randomForestClassifier.predict(test_tweet_vector)
print(colored("Testing accuracy: {}%".format(accuracy_score(test_data['Sentiment'], prediction)*100), "green"))

[33mPredicting on test data[0m
[32mTesting accuracy: 74.4859375%[0m
