# Project 3 - Web APIs and Natural Language Processing

## Running a Random Forest 

In [10]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

#### 1. Splitting the data into train and test

In [11]:
# Reading the data into a dataframe 
he_cg = pd.read_csv('./clean_subreddit.csv')

In [12]:
# Defining X and y 
X = he_cg['full_text']

In [13]:
y = he_cg['subreddit']

In [14]:
# Before splitting the data I want to see if classes are imbalanced. They are not
he_cg['subreddit'].value_counts()

careerguidance     9887
highereducation    9141
Name: subreddit, dtype: int64

In [15]:
# Splitting the data 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [16]:
# Using the CountVectorizer to vectorize each row. 
vectorizer = CountVectorizer()

In [17]:
# Fitting the vectorizer to my data 
vectorizer.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [18]:
# Transforming the data 
X_train = vectorizer.transform(X_train)

In [19]:
# Turning the data into a dataframe
X_train_df = pd.DataFrame(X_train.toarray(), columns = vectorizer.get_feature_names())
X_train_df.shape

(13319, 12882)

In [20]:
# Doing the same for the test data 
X_test = vectorizer.transform(X_test)

In [21]:
X_test_df = pd.DataFrame(X_test.toarray(), columns=vectorizer.get_feature_names())

X_test_df.shape

(5709, 12882)

In [22]:
# Calculating the baseline accuracy: 
y.value_counts(normalize=True)

careerguidance     0.519603
highereducation    0.480397
Name: subreddit, dtype: float64

In [33]:
# Instantiating the model and playing with a few parameters to control the number of leaves, the features 
# included, and the number of trees 

rf = RandomForestClassifier(n_estimators=100, max_depth=5, min_samples_leaf=10, max_features=0.5)

In [34]:
# Fitting the model 
rf.fit(X_train_df, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=5, max_features=0.5, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=10, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [1]:
# Evaluating the model on the train data 
rf.score(X_train_df, y_train)

NameError: name 'rf' is not defined

In [36]:
# Evaluating the model on the test data 
rf.score(X_test_df, y_test)

0.8420038535645472

In [41]:
# Setting the parameters to do a singe gridsearch. 
params = {
    'n_estimators': [100]
}

In [42]:
# Instantiating and fitting the model 
gs = GridSearchCV(rf, param_grid=params, return_train_score=True, cv=5)
gs.fit(X_train_df, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=5,
                                              max_features=0.5,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=10,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid='warn',

In [43]:
# Evaluating the model on my train data 
gs.score(X_train_df, y_train)

0.8490126886402883

In [44]:
# Evaluating the model on my test data
gs.score(X_test_df, y_test)

0.8413032054650552

In [45]:
# Getting the most important words for the model 
gs.best_estimator_.feature_importances_

array([0., 0., 0., ..., 0., 0., 0.])

In [46]:
# Turning them into a dataframe 
features_df = pd.DataFrame(gs.best_estimator_.feature_importances_, X_train_df.columns, columns = ['Importance'])

In [52]:
# Organizing them by score so that I can see the 20 most important
features_df.sort_values(by='Importance', ascending=False).head(20)

Unnamed: 0,Importance
and0,0.523197
education,0.162969
students,0.060944
university,0.050685
job,0.04582
career,0.043741
my,0.035519
higher,0.017246
what,0.012424
universities,0.011926
