## RandomForestClassifier

In [1]:
import pandas as pd
import numpy as np
#Used for pretty printing
from pprint import pprint


In [2]:
df = pd.read_csv("Data/headlinesViews.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,title,homesection,publishdate,pageviews,popularity
0,18976,Fem danskere i Touren: Sunweb udtager Søren Kragh,Cykling,2018-07-02 17:41:01 UTC,1499,0
1,31594,Klæstrup gift i moské: Jeg er så lykkelig,Danske kendte,2018-08-13 17:40:39 UTC,227329,1
2,10062,Teenager druknet efter skub i leg: Nu er kamme...,112,2018-06-02 09:57:02 UTC,85556,1
3,23137,It-fejl i europæisk flyveplansystem er rettet,Samfund,2018-04-04 00:10:44 UTC,1346,0
4,7639,Voldtaget to timer på loftet over café: - Jeg ...,112,2018-08-06 20:42:12 UTC,303642,1


In [3]:
df.shape

(31518, 6)

In [4]:
df.popularity.value_counts()

1    15799
0    15719
Name: popularity, dtype: int64

In [5]:
df.drop(["Unnamed: 0", "publishdate","homesection","pageviews"], axis=1, inplace=True)
df.head()

Unnamed: 0,title,popularity
0,Fem danskere i Touren: Sunweb udtager Søren Kragh,0
1,Klæstrup gift i moské: Jeg er så lykkelig,1
2,Teenager druknet efter skub i leg: Nu er kamme...,1
3,It-fejl i europæisk flyveplansystem er rettet,0
4,Voldtaget to timer på loftet over café: - Jeg ...,1


In [6]:
# Make everything lowercase
df["title"] = df["title"].str.lower()
df

Unnamed: 0,title,popularity
0,fem danskere i touren: sunweb udtager søren kragh,0
1,klæstrup gift i moské: jeg er så lykkelig,1
2,teenager druknet efter skub i leg: nu er kamme...,1
3,it-fejl i europæisk flyveplansystem er rettet,0
4,voldtaget to timer på loftet over café: - jeg ...,1
...,...,...
31513,helt til hest: politiet mangler borgernes hjæl...,0
31514,kæmpe ballade under pokalfinale: politiet brug...,1
31515,lavede sexbånd med kardashian: jeg er videre,0
31516,politiet bekræfter: nu skal ronaldo afhøres,1


In [7]:
# Remove all punctuation ()
df['title'] = df['title'].str.replace('[^\w\s]','')
df['title'].head()

0     fem danskere i touren sunweb udtager søren kragh
1             klæstrup gift i moské jeg er så lykkelig
2    teenager druknet efter skub i leg nu er kammer...
3         itfejl i europæisk flyveplansystem er rettet
4    voldtaget to timer på loftet over café  jeg vi...
Name: title, dtype: object

In [8]:
import nltk 
from nltk.corpus import stopwords
#nltk.download('stopwords')
stop = stopwords.words('danish')
print(stop)


['og', 'i', 'jeg', 'det', 'at', 'en', 'den', 'til', 'er', 'som', 'på', 'de', 'med', 'han', 'af', 'for', 'ikke', 'der', 'var', 'mig', 'sig', 'men', 'et', 'har', 'om', 'vi', 'min', 'havde', 'ham', 'hun', 'nu', 'over', 'da', 'fra', 'du', 'ud', 'sin', 'dem', 'os', 'op', 'man', 'hans', 'hvor', 'eller', 'hvad', 'skal', 'selv', 'her', 'alle', 'vil', 'blev', 'kunne', 'ind', 'når', 'være', 'dog', 'noget', 'ville', 'jo', 'deres', 'efter', 'ned', 'skulle', 'denne', 'end', 'dette', 'mit', 'også', 'under', 'have', 'dig', 'anden', 'hende', 'mine', 'alt', 'meget', 'sit', 'sine', 'vor', 'mod', 'disse', 'hvis', 'din', 'nogle', 'hos', 'blive', 'mange', 'ad', 'bliver', 'hendes', 'været', 'thi', 'jer', 'sådan']


In [9]:
# Show amount of stop words
df['stopwords'] = df['title'].apply(lambda x: len([x for x in x.split() if x in stop]))
df[['title','stopwords']].head()

Unnamed: 0,title,stopwords
0,fem danskere i touren sunweb udtager søren kragh,1
1,klæstrup gift i moské jeg er så lykkelig,3
2,teenager druknet efter skub i leg nu er kammer...,4
3,itfejl i europæisk flyveplansystem er rettet,2
4,voldtaget to timer på loftet over café jeg vi...,8


In [10]:
# Remove stop words
df['title'] = df['title'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
df['title'].head()

0     fem danskere touren sunweb udtager søren kragh
1                    klæstrup gift moské så lykkelig
2          teenager druknet skub leg kammerat sigtet
3            itfejl europæisk flyveplansystem rettet
4    voldtaget to timer loftet café straffe fortjent
Name: title, dtype: object

In [11]:
# Get the 10 most common words
### Should this be all words above X count? Or something else?
freq = pd.Series(' '.join(df['title']).split()).value_counts()[:10]
freq

dansk      1380
kan         973
mand        854
så          781
ny          774
får         720
ved         618
år          612
danmark     599
danske      595
dtype: int64

In [12]:
# Remove the 10 most common words as they're not useful when classifying our data 
# (This allows us to catch some additional "stop words" not in our stop word library)
freq = list(freq.index)
df['title'] = df['title'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
df['title'].head()

0     fem danskere touren sunweb udtager søren kragh
1                       klæstrup gift moské lykkelig
2          teenager druknet skub leg kammerat sigtet
3            itfejl europæisk flyveplansystem rettet
4    voldtaget to timer loftet café straffe fortjent
Name: title, dtype: object

In [13]:
# Get the 10 least common words
### Should this be all words that are counted only once? Or something else?

freq = pd.Series(' '.join(df['title']).split()).value_counts()[-10:]
freq

heksejægere     1
cremet          1
mæglerbrøler    1
relativt        1
finalestart     1
rippet          1
betonblok       1
geislers        1
majkens         1
købsprisen      1
dtype: int64

In [14]:
# Remove the rarest words
freq = list(freq.index)
df['title'] = df['title'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
df['title'].head()

0     fem danskere touren sunweb udtager søren kragh
1                       klæstrup gift moské lykkelig
2          teenager druknet skub leg kammerat sigtet
3            itfejl europæisk flyveplansystem rettet
4    voldtaget to timer loftet café straffe fortjent
Name: title, dtype: object

# NLP

In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df.title, df.popularity, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((23638,), (7880,), (23638,), (7880,))

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
vect_new = TfidfVectorizer(min_df=5).fit(X_train)
tfid_x_train = vect_new.transform(X_train)
tfid_x_test = vect_new.transform(X_test)

In [17]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state = 42)
print('Parameters currently in use:\n')
pprint(rf.get_params())

Parameters currently in use:

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}


In [18]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 1, stop = 500, num = 11)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(2, 50, num = 10)]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf
              }

pprint(random_grid)


{'max_depth': [2, 7, 12, 18, 23, 28, 34, 39, 44, 50],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [1, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500]}


In [19]:
from sklearn.model_selection import RandomizedSearchCV
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
rf_random.fit(tfid_x_train, y_train)


Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   13.2s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  2.2min finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
               

In [20]:
rf_random.best_params_


{'n_estimators': 500,
 'min_samples_split': 5,
 'min_samples_leaf': 4,
 'max_features': 'sqrt',
 'max_depth': 12}

In [21]:
rf_random.best_score_

0.6433287773255773

In [22]:
from sklearn.metrics import accuracy_score
base_model = RandomForestClassifier(random_state = 42)
base_model.fit(tfid_x_train, y_train)
base_pred = base_model.predict(tfid_x_test)
base_accuracy = accuracy_score(base_pred, y_test)
base_accuracy

0.6459390862944162

In [23]:
best_random = rf_random.best_estimator_
best_pred_train = best_random.predict(tfid_x_train)
best_pred_test = best_random.predict(tfid_x_test)
print("Random Forest Tweaked")
print("Training set score: {:.3f}".format(accuracy_score(best_pred_train, y_train)))
print("Test set score: {:.3f}".format(accuracy_score(best_pred_test, y_test)))

Random Forest Tweaked
Training set score: 0.683
Test set score: 0.647


In [25]:
print('Improvement of {:0.2f}%.'.format( 100 * (accuracy_score(best_pred_test, y_test) - base_accuracy) / base_accuracy))

Improvement of 0.12%.


In [None]:
## Basically RandomForestClassifier is bad so we won't be using that. 