# Predicting Reddit comments using Random Forests and Count Vectorizer 

In [3]:
import requests
import json
import time
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction import stop_words
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from nltk.stem import WordNetLemmatizer
import regex as re
from bs4 import BeautifulSoup 
from sklearn.linear_model import LogisticRegression, RidgeClassifier, LassoCV, Lasso
from sklearn import linear_model
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix

In [4]:
#import in csv file with all the cleaned posts and create a dataframe
df=pd.read_csv('./files/cleanposts.csv')

In [5]:
#check for missing titles and replace with null
#only 1 exists ok to change to "null"
df['title']=df['title'].replace(np.nan,'null')
df['title'].isnull().sum()

0

# Train Test Split

In [6]:
#defind variables
X=df['title']  
y=df['subreddit']

In [7]:
#train test split
X_train, X_test, y_train, y_test= train_test_split(X,y,stratify=y,random_state=42)

In [8]:
#double check for missing values
X_train=X_train.replace(np.nan,'null')
X_train.isnull().sum()

0

# Train Models

In [9]:
cv=CountVectorizer()   
model=MultinomialNB()

pipe=Pipeline([('cv',cv),('model',model)])

In [10]:
#initial gridsearch untuned with countvectorizer
params={}
gs=GridSearchCV(pipe, param_grid=params)
gs.fit(X_train, y_train)
gs.best_score_
gs_test=gs.score(X_test,y_test)
print('GS Train Score : ',gs.best_score_)
print('GS TestScore   : ',gs_test)
print('Optimal Param  : ',gs.best_params_)

GS Train Score :  0.8615548455804047
GS TestScore   :  0.8757961783439491
Optimal Param  :  {}


In [11]:
#tuning gridsearch by stop words and max features, ngrams
params={'cv__stop_words':[None,'english'],
        'cv__max_features':[1900,3000],
        'cv__ngram_range':[(1,1),(1,2)],
        'cv__max_df':[0.5, 0.7, 1.0]}
gs=GridSearchCV(pipe, param_grid=params)
gs.fit(X_train, y_train)
gs.best_score_
gs_test=gs.score(X_test,y_test)
print('GS Train Score : ',gs.best_score_)
print('GS TestScore   : ',gs_test)
print('Optimal Param  : ',gs.best_params_)

GS Train Score :  0.8722044728434505
GS TestScore   :  0.8630573248407644
Optimal Param  :  {'cv__max_df': 0.5, 'cv__max_features': 3000, 'cv__ngram_range': (1, 1), 'cv__stop_words': 'english'}


### Running a Multinomial model, using Count Vectorizer, there were no changes when applying parameters, such as stop words or max features as part of the count vectorizer

In [12]:
#using BernoulliNB
cv=CountVectorizer(binary=True)   #binary=True
model=BernoulliNB()

pipeb=Pipeline([('cv',cv),('model',model)])

In [13]:
#initial gridsearch untuned
params={}
gs=GridSearchCV(pipeb, param_grid=params)
gs.fit(X_train, y_train)
gs.best_score_
gs_test=gs.score(X_test,y_test)
print('GS Train Score : ',gs.best_score_)
print('GS Test Score  : ',gs_test)
print('Optimal Param  : ',gs.best_params_)

GS Train Score :  0.8668796592119276
GS Test Score  :  0.8503184713375797
Optimal Param  :  {}


In [14]:
#tuning gridsearch by stop words and max features
params={'cv__stop_words':[None,'english'],
        'cv__max_features':[1500,2000,2500]}
gs=GridSearchCV(pipeb, param_grid=params)
gs.fit(X_train, y_train)
gs.best_score_
gs_test=gs.score(X_test,y_test)
print('GS Train Score : ',gs.best_score_)
print('GS Test Score : ',gs_test)
print('Optimal Param  : ',gs.best_params_)

GS Train Score :  0.8668796592119276
GS Test Score :  0.8789808917197452
Optimal Param  :  {'cv__max_features': 2500, 'cv__stop_words': None}


In [15]:
#tuning gridsearch by stop words and max features, ngrams
params={'cv__stop_words':[None,'english'],
        'cv__max_features':[1500,2000,2500],
        'cv__ngram_range':[(1,1),(1,2)],
        'cv__max_df':[0.5, 0.7, 1.0]}
gs=GridSearchCV(pipeb, param_grid=params)
gs.fit(X_train, y_train)
gs.best_score_
gs_test=gs.score(X_test,y_test)
print('GS Train Score : ',gs.best_score_)
print('GS Test Score  : ',gs_test)
print('Optimal Param  : ',gs.best_params_)

GS Train Score :  0.8668796592119276
GS Test Score  :  0.8789808917197452
Optimal Param  :  {'cv__max_df': 0.5, 'cv__max_features': 2500, 'cv__ngram_range': (1, 1), 'cv__stop_words': None}


### Using Count Vectorizer and BernoulliNB model, with similar parameters as MultinomialNB, increased the accuracy by 1%.  Still needs some work

# Using TF-IDF and Hash Models

In [16]:
#create and fit model
#remove english stop words
t_vec = TfidfVectorizer(stop_words = 'english')
h_vec = HashingVectorizer(stop_words = 'english')

#fit & transform to tfidf model
t_vec_train = t_vec.fit_transform(X_train, y_train)
t_vec_test = t_vec.transform(X_test)

#fit & transform to hash model
h_vec_train = h_vec.fit_transform(X_train, y_train)
h_vec_test  = h_vec.transform(X_test)

In [17]:
# of features for the tfidf model
len(t_vec_train.toarray().T)

2702

In [18]:
# of features for the hash model
len(h_vec_train.toarray().T)

1048576

In [19]:
# model and performance of the TFIDF on lr with ridge penalty
lr = LogisticRegression(penalty = 'l2')

lr.fit(t_vec_train, y_train)
print('Ridge Log Reg Train Score : ', lr.score(t_vec_train, y_train))
print('Ridge Log Reg Test Score  : ',lr.score(t_vec_test, y_test))
    

Ridge Log Reg Train Score :  0.9914802981895634
Ridge Log Reg Test Score  :  0.8630573248407644


In [20]:
# model and performance of the TFIDF on lr with lasso penalty
lr = LogisticRegression(penalty = 'l1')

lr.fit(t_vec_train, y_train)
print('Lasso Log Reg Train Score : ', lr.score(t_vec_train, y_train))
print('Lasso Log Reg Test Score  : ',lr.score(t_vec_test, y_test))


Lasso Log Reg Train Score :  0.8104366347177849
Lasso Log Reg Test Score  :  0.7961783439490446


In [21]:
# model and performance of the hash on lr with ridge penalty
lr = LogisticRegression(penalty = 'l2')

lr.fit(h_vec_train, y_train)
print('Ridge Hash Train Score : ', lr.score(h_vec_train, y_train))
print('Ridge Hash Test Score  : ',lr.score(h_vec_test, y_test))
    

Ridge Hash Train Score :  0.9829605963791267
Ridge Hash Test Score  :  0.8407643312101911


In [22]:
# model and performance of the hash on lr with lasso penalty
lr = LogisticRegression(penalty = 'l1')

lr.fit(h_vec_train, y_train)
print('Lasso Hash Train Score : ', lr.score(h_vec_train, y_train))
print('Lasso Hash Test Score  : ',lr.score(h_vec_test, y_test))
    

Lasso Hash Train Score :  0.8125665601703941
Lasso Hash Test Score  :  0.8057324840764332


### Using TF-IDF and Hash, they generally did worse as far as scoring, with the exception of using Ridge.  This jumped up quite high, but overfitting between the train and test data sets.  So far this is the best model.  To further check out this model, time to run it through Grid Search to further optimize parameters

In [23]:
estimators = [("tf_idf", TfidfVectorizer()), 
              ("ridge", linear_model.RidgeClassifier())]
model = Pipeline(estimators)

In [24]:
#tfidf with ridge stop words removed and optomized
params = {"ridge__alpha":[1, 3, 5],               #regularization param
          "tf_idf__min_df": [1, 3],               #min count of words allowed
          "tf_idf__ngram_range": [(1,7), (1,8)],  #1-grams or 2-grams
          "tf_idf__stop_words": [None, "english"],#use stopwords or not
          "tf_idf__use_idf":[True, False],        #whether to scale columns or just leave normalized bag of words.
          "tf_idf__max_df": [0.2, 0.3, 0.4]}      #max count of words allowed
gs=GridSearchCV(estimator=model,param_grid=params)
gs.fit(X_train, y_train)
gs.best_score_
gs_test=gs.score(X_test,y_test)
print('GS Train TFIDF Ridge Score  : ',gs.best_score_)
print('GS Test TFIDF Ridge Score   : ',gs_test)
print('Optimal Param TFIDF Ridge   : ',gs.best_params_)

GS Train TFIDF Ridge Score  :  0.8647497337593184
GS Test TFIDF Ridge Score   :  0.8726114649681529
Optimal Param TFIDF Ridge   :  {'ridge__alpha': 1, 'tf_idf__max_df': 0.2, 'tf_idf__min_df': 1, 'tf_idf__ngram_range': (1, 7), 'tf_idf__stop_words': 'english', 'tf_idf__use_idf': True}


### Using Grid Search, TF_IDF with Ridge penalty, the model came back to earth and more in line with the Count Vectorizations.  The model became underfit instead of the inital overfit, but the score dropped a hair

In [25]:
#run through cv model, change ngram range from (1,1) to (1,2)
cvec=CountVectorizer(ngram_range=(1,2),stop_words='english')

# Fit & transform the vectorizer on our corpus
data_vector = cvec.fit_transform(X_train)
#data_vector_test = cvec.transform(y_test)


#create df to view data
df_cvec_1  = pd.DataFrame(data_vector.toarray(),
                     columns=cvec.get_feature_names())

In [26]:
#create random forest model
rfc=RandomForestClassifier(random_state=42)

param_grid = { 
    'n_estimators': [900, 1000],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [10,11,12],
    'criterion' :['gini', 'entropy']}

gs = GridSearchCV(estimator=rfc, param_grid=param_grid)
gs.fit(data_vector, y_train)
gs.best_score_
print('GS Train Score : ',gs.best_score_)
print('Optimal Param  : ',gs.best_params_)

GS Train Score :  0.8487752928647497
Optimal Param  :  {'criterion': 'entropy', 'max_depth': 11, 'max_features': 'log2', 'n_estimators': 900}


### Running the same information through a Random Forest model decreases the score a bit

### We want to predict a binary variable - whether the number of comments was low or high. Computing the median number of comments and create a new binary variable that is true when the number of comments is high (above the median)

In [27]:
#change num_comments field from int to float
df['num_comments']=df['num_comments'].astype(int).astype(float)
df['num_comments'].dtype

dtype('float64')

In [28]:
#find mean of number of comments
mean_comments=np.mean(df['num_comments'])
np.mean(df['num_comments'])
#mean number of comments is 33.06

32.25778132482043

In [29]:
median_quartile_comments=np.percentile(df.num_comments, 50) 
bottom_quartile_comments=np.percentile(df.num_comments, 25) 
upper_quartile_comments=np.percentile(df.num_comments, 75) 

In [30]:
df['num_comments'].describe()

count    1253.000000
mean       32.257781
std        71.814044
min         0.000000
25%         5.000000
50%        12.000000
75%        30.000000
max      1015.000000
Name: num_comments, dtype: float64

In [31]:
#assign 1 to comments over 33.06 or 0 to posts under 33.06
df['comment_check_mean'] = [0 if num_comment_list < mean_comments 
                       else 1 for num_comment_list in df['num_comments']]

#25th percentile
df['comment_check_25'] = [0 if num_comment_list < bottom_quartile_comments  
                       else 1 for num_comment_list in df['num_comments']]

#75th percentile
df['comment_check_75'] = [0 if num_comment_list < upper_quartile_comments  
                       else 1 for num_comment_list in df['num_comments']]


In [32]:
#number of high (283) vs low (969) comments compared to mean of 33.06
df['comment_check_mean'].value_counts()

0    965
1    288
Name: comment_check_mean, dtype: int64

In [33]:
#number of comments in 25% quartile of num_comments
df['comment_check_25'].value_counts()

1    945
0    308
Name: comment_check_25, dtype: int64

In [34]:
#number of comments in 75% quartile of num_comments
df['comment_check_75'].value_counts()

0    936
1    317
Name: comment_check_75, dtype: int64

In [35]:
#baseline accuracy of model using subreddit
df['target'].value_counts(normalize=True)
#this means 50% of posts are related to teh subreddit starwars

1    0.500399
0    0.499601
Name: target, dtype: float64

In [36]:
#baseline accuracy of model using mean of comments
df['comment_check_mean'].value_counts(normalize=True)
#this means 77.4% of posts have less than the mean of comments

0    0.770152
1    0.229848
Name: comment_check_mean, dtype: float64

In [37]:
#baseline accuracy of model using 25th perncentile of comments
df['comment_check_25'].value_counts(normalize=True)
#this means 75.0% of posts have less than t?he 25th percentile of number of comments

1    0.75419
0    0.24581
Name: comment_check_25, dtype: float64

In [38]:
#baseline accuracy of model using 75th perncentile of comments
df['comment_check_75'].value_counts(normalize=True)
#this means 74.9% of posts have less than the 75th percentile of number of comments

0    0.747007
1    0.252993
Name: comment_check_75, dtype: float64

### Create a Random Forest model to predict High/Low number of comments usinging the subreddit as a feature

In [39]:
#set varaiables, new cross train split

X=df['title']
y=df['comment_check_mean']

X_train, X_test, y_train, y_test=train_test_split(X,y,random_state=42,stratify=y)

In [40]:
df.head()

Unnamed: 0,approved_at_utc,approved_by,archived,author,author_cakeday,author_flair_background_color,author_flair_css_class,author_flair_richtext,author_flair_template_id,author_flair_text,...,user_reports,view_count,visited,whitelist_status,wls,target,time_passed,comment_check_mean,comment_check_25,comment_check_75
0,,,False,Yunners,,,21 thisFlairHasText,[],,Jedi Knight,...,[],,False,all_ads,6,1,3.5 months ago,1,1,1
1,,,False,Yunners,,,21 thisFlairHasText,[],,Jedi Knight,...,[],,False,all_ads,6,1,1.4333333333333333 months ago,1,1,1
2,,,False,getridofwires,True,,42,[],,,...,[],,False,all_ads,6,1,1.3 months ago,1,1,1
3,,,False,noslowsongs,,,,[],,,...,[],,False,all_ads,6,1,1.3 months ago,1,1,1
4,,,False,aditseth03,,,9,[],,,...,[],,False,all_ads,6,1,1.3 months ago,1,1,1


In [41]:
#countvectorizer on subreddit and title to determine if # of comments above/below mean

cvec=CountVectorizer(stop_words='english')

# Fit & transform the vectorizer on our corpus
data_vector = cvec.fit_transform(X_train)
#data_vector_test = cvec.transform(y_test)

#create df to view data
df_cvec  = pd.DataFrame(data_vector.toarray(),
                    columns=cvec.get_feature_names())

In [42]:
df_cvec.head()

Unnamed: 0,10,100th,11001001,12,129,13,13m,13th,14,15,...,younglings,youre,youth,youve,yuuzhan,yvanquinet,ywings,zahn,zefram,ziyal
0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [43]:
#create random forest model
rfc=RandomForestClassifier(random_state=42)

param_grid = { 
    'n_estimators': [200, 300],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']}

gs = GridSearchCV(estimator=rfc, param_grid=param_grid)
gs.fit(data_vector, y_train)
gs.best_score_
print('GS Train Score : ',gs.best_score_)
print('Optimal Param  : ',gs.best_params_)

GS Train Score :  0.7699680511182109
Optimal Param  :  {'criterion': 'gini', 'max_depth': 4, 'max_features': 'auto', 'n_estimators': 200}


### Using Grid Search and Random Forest, the model performed worse than any of the others, by about 7%

## Examining other features to see if they are helpful in modeling

In [44]:
#create list of top words in titles COMBINED
word_count_df = pd.DataFrame( columns = ['Word','Count'])

for col in df_cvec.columns:
    word_count_df.loc[len(word_count_df)] = [col, df_cvec[col].sum()]
    

In [45]:
word_count_df.sort_values('Count', ascending = False).head(20)

Unnamed: 0,Word,Count
2307,star,230
2523,trek,156
2673,wars,90
1607,new,48
1374,like,45
1269,just,41
2491,tng,41
2146,series,31
721,ds9,29
2252,solo,29


In [53]:
#copy columns episode and new to df
df['episode']=df_cvec['episode'].copy()
df['new']=df_cvec['new'].copy()

In [54]:
#fill missing values on posts without episode with a 0, same with new
df['episode']=df['episode'].replace(np.nan,'0').astype(int)
df['new']=df['new'].replace(np.nan,'0').astype(int)

In [55]:
df['episode'].value_counts()

0    1229
1      23
2       1
Name: episode, dtype: int64

In [56]:
#fix outlier in episode column of 2.  make it a 0
df['episode']=df['episode'].replace(2,0)

In [57]:
#set new variables and new train test split
X=df['subreddit']
y=df['episode']

X_train, X_test, y_train, y_test=train_test_split(X,y,random_state=42,stratify=y)

In [58]:
#run through cv model, change ngram range from (1,1) to (1,2)
cvec=CountVectorizer(ngram_range=(1,2),stop_words='english')

# Fit & transform the vectorizer on our corpus
data_vector = cvec.fit_transform(X_train)
#data_vector_test = cvec.transform(y_test)

#create df to view data
df_cvec_1  = pd.DataFrame(data_vector.toarray(),
                     columns=cvec.get_feature_names())

In [59]:
#put cv model into random forest
rf=RandomForestClassifier(random_state=42)

param_grid = { 
    'n_estimators': [200, 300],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']}

gs = GridSearchCV(estimator=rf, param_grid=param_grid)
gs.fit(data_vector, y_train)
gs.best_score_
print('GS Train Score : ',gs.best_score_)
print('Optimal Param  : ',gs.best_params_)


GS Train Score :  0.9818956336528222
Optimal Param  :  {'criterion': 'gini', 'max_depth': 4, 'max_features': 'auto', 'n_estimators': 200}


### Using the word Episode to see if it fits into the appropriate subreddit, has quite good success to match to the Star Wars Reddit

In [60]:
#set new variables and new train test split
X=df['title']
y=df['episode']

X_train, X_test, y_train, y_test=train_test_split(X,y,random_state=42,stratify=y)

In [61]:
#run through cv model, change ngram range from (1,1) to (1,2)
cvec=CountVectorizer(ngram_range=(1,2),stop_words='english')

# Fit & transform the vectorizer on our corpus
data_vector = cvec.fit_transform(X_train)
#data_vector_test = cvec.transform(y_test)

#create df to view data
df_cvec_1  = pd.DataFrame(data_vector.toarray(),
                     columns=cvec.get_feature_names())

In [62]:
#put cv model into random forest
rf=RandomForestClassifier(random_state=42)

param_grid = { 
    'n_estimators': [200, 300],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']}

gs = GridSearchCV(estimator=rf, param_grid=param_grid)
gs.fit(data_vector, y_train)
gs.best_score_
print('GS Train Score : ',gs.best_score_)
print('Optimal Param  : ',gs.best_params_)


GS Train Score :  0.9818956336528222
Optimal Param  :  {'criterion': 'gini', 'max_depth': 4, 'max_features': 'auto', 'n_estimators': 200}


### Using the word 'episode' in the title of the post, shows a very high score, so there is a strong correlation between the two.  Now to try episode to predict reddit 

In [63]:
#set new variables and new train test split
X=df['subreddit']
y=df['episode']


X_train, X_test, y_train, y_test=train_test_split(X,y,random_state=42,stratify=y)

In [64]:
#run through cv model, change ngram range from (1,1) to (1,2)
cvec=CountVectorizer(ngram_range=(1,2),stop_words='english')

# Fit & transform the vectorizer on our corpus
data_vector = cvec.fit_transform(X_train)
#data_vector_test = cvec.transform(y_test)

#create df to view data
df_cvec_1  = pd.DataFrame(data_vector.toarray(),
                     columns=cvec.get_feature_names())

In [65]:
#put cv model into random forest
rf=RandomForestClassifier(random_state=42)

param_grid = { 
    'n_estimators': [200, 300],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']}

gs = GridSearchCV(estimator=rf, param_grid=param_grid)
gs.fit(data_vector, y_train)
gs.best_score_
print('GS Train Score : ',gs.best_score_)
print('Optimal Param  : ',gs.best_params_)

GS Train Score :  0.9818956336528222
Optimal Param  :  {'criterion': 'gini', 'max_depth': 4, 'max_features': 'auto', 'n_estimators': 200}


### The word Episode appears to be able to correctly predict the Star Wards Reddit 97% of the time, which is surprising given subject matter expertise.  The Star Wars movies are each an episode, but there are also countless Star Trek TV episodes.  One would think that episode would be more closely tied to Star Trek

In [72]:
df=df[df['new'] != 2]

In [76]:
#set new variables and new train test split
X=df['title']
y=df['new']

X_train, X_test, y_train, y_test=train_test_split(X,y,random_state=42,stratify=y)

In [77]:
#run through cv model, change ngram range from (1,1) to (1,2)
cvec=CountVectorizer(ngram_range=(1,2),stop_words='english')

# Fit & transform the vectorizer on our corpus
data_vector = cvec.fit_transform(X_train)
#data_vector_test = cvec.transform(y_test)

#create df to view data
df_cvec_1  = pd.DataFrame(data_vector.toarray(),
                     columns=cvec.get_feature_names())

In [78]:
#put cv model into random forest
rf=RandomForestClassifier(random_state=42)

param_grid = { 
    'n_estimators': [200, 300],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']}

gs = GridSearchCV(estimator=rf, param_grid=param_grid)
gs.fit(data_vector, y_train)
gs.best_score_
print('GS Train Score : ',gs.best_score_)
print('Optimal Param  : ',gs.best_params_)

GS Train Score :  0.9627263045793397
Optimal Param  :  {'criterion': 'gini', 'max_depth': 4, 'max_features': 'auto', 'n_estimators': 200}


### Using the word "New" also has strong results to predict Star Wars based posts at 96%

In [79]:
#set new variables and new train test split
X=df['subreddit']
y=df['new']

X_train, X_test, y_train, y_test=train_test_split(X,y,random_state=42,stratify=y)

In [80]:
#run through cv model, change ngram range from (1,1) to (1,2)
cvec=CountVectorizer(ngram_range=(1,2),stop_words='english')

# Fit & transform the vectorizer on our corpus
data_vector = cvec.fit_transform(X_train)
#data_vector_test = cvec.transform(y_test)

#create df to view data
df_cvec_1  = pd.DataFrame(data_vector.toarray(),
                     columns=cvec.get_feature_names())

In [81]:
#put cv model into random forest
rf=RandomForestClassifier(random_state=42)

param_grid = { 
    'n_estimators': [1, 25],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [1,2,3],
    'criterion' :['gini', 'entropy']}

gs = GridSearchCV(estimator=rf, param_grid=param_grid)
gs.fit(data_vector, y_train)
gs.best_score_
print('GS Train Score : ',gs.best_score_)
print('Optimal Param  : ',gs.best_params_)


GS Train Score :  0.9627263045793397
Optimal Param  :  {'criterion': 'gini', 'max_depth': 1, 'max_features': 'auto', 'n_estimators': 1}


### Using the word "New" was able to predict the Star Wars subreddit 96.4% of the time

## Use cross-validation to evaluate the model

In [82]:
#cross val score of rf and episode

print('cross val',cross_val_score(rf, data_vector, y_train).mean())


cross val 0.9627294570290236


### Repeat the model-building process with a non-tree-based method.

In [83]:
#stop_words = list(ENGLISH_STOP_WORDS) + ['star','wars','trek']

In [84]:
X=df['title']
y=df['episode']

X_train, X_test, y_train, y_test=train_test_split(X,y,random_state=42,stratify=y)

In [85]:
estimators = [("tf_idf", TfidfVectorizer()), 
              ("ridge", linear_model.RidgeClassifier())]
model = Pipeline(estimators)

#tfidf with ridge stop words removed and optomized
params = {"ridge__alpha":[1, 3, 5],               #regularization param
          "tf_idf__min_df": [1, 3],               #min count of words allowed
          "tf_idf__ngram_range": [(1,2)],  #1-grams or 2-grams
          "tf_idf__stop_words": [None, "english"],#use stopwords or not
          "tf_idf__use_idf":[True, False],        #whether to scale columns or just leave normalized bag of words.
          "tf_idf__max_df": [0.2, 0.3, 0.4]}      #max count of words allowed
gs=GridSearchCV(estimator=model,param_grid=params)
gs.fit(X_train, y_train)
gs.best_score_
print('GS Train TFIDF Ridge Score  : ',gs.best_score_)
print('Optimal Param TFIDF Ridge   : ',gs.best_params_)

GS Train TFIDF Ridge Score  :  0.9818956336528222
Optimal Param TFIDF Ridge   :  {'ridge__alpha': 1, 'tf_idf__max_df': 0.2, 'tf_idf__min_df': 1, 'tf_idf__ngram_range': (1, 2), 'tf_idf__stop_words': None, 'tf_idf__use_idf': True}
