#  Import statements

In [6]:
# Load the labelled headline dataset from disk and preview its first five rows.
import pandas as pd

df = pd.read_csv(filepath_or_buffer='newdatasetfinal.csv')
df.head(n=5)

Unnamed: 0,title,NewCategory
0,Bitcoin is down 60 percent this year. Here's w...,Business & Finance
1,6 health problems marijuana could treat better...,Crime
2,9 charts that explain the history of global we...,Business & Finance
3,Remember when legal marijuana was going to sen...,Crime
4,Obamacare succeeded for one simple reason: it'...,Technology & Health


In [7]:
# Reduce the frame to label + headline, encode each label as an integer id,
# and build lookup tables in both directions between label names and ids.
from io import StringIO  # NOTE(review): not referenced in the visible cells

col = ['NewCategory', 'title']

# Keep only the two columns of interest, then drop rows whose headline is missing.
has_title = df['title'].notna()
df = df[col][has_title]
df.columns = list(col)

# factorize() returns (codes, uniques); only the integer codes are stored.
df['category_id'] = pd.factorize(df['NewCategory'])[0]

# One row per distinct (name, id) pair, ordered by id.
category_id_df = (
    df[['NewCategory', 'category_id']]
    .drop_duplicates()
    .sort_values('category_id')
)
category_to_id = dict(category_id_df.values)
id_to_category = dict(category_id_df[['category_id', 'NewCategory']].values)
df.head()

Unnamed: 0,NewCategory,title,category_id
0,Business & Finance,Bitcoin is down 60 percent this year. Here's w...,0
1,Crime,6 health problems marijuana could treat better...,1
2,Business & Finance,9 charts that explain the history of global we...,0
3,Crime,Remember when legal marijuana was going to sen...,1
4,Technology & Health,Obamacare succeeded for one simple reason: it'...,2


# Creating new features

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram + bigram TF-IDF over the headlines:
#   - English stop words are removed,
#   - a term must occur in at least 5 headlines to be kept (min_df=5),
#   - term frequency is log-scaled (sublinear_tf) and rows are L2-normalised.
# NOTE(review): the vectorizer is fit on every row of df, i.e. before the
# train/test split performed in a later cell.
tfidf = TfidfVectorizer(
    encoding='latin-1',
    stop_words='english',
    ngram_range=(1, 2),
    min_df=5,
    sublinear_tf=True,
    norm='l2',
)

# .toarray() converts the sparse TF-IDF matrix into a dense NumPy array.
features = tfidf.fit_transform(df['title']).toarray()
labels = df['category_id']
features.shape

(16798, 5527)

# Splitting the Data into Training and Testing

In [9]:
# Training and testing data
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=0,
)

# Training Random Forest Classifier

In [10]:
# Random Forest baseline: 100 trees, fixed seed.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

classifier = RandomForestClassifier(random_state=42, n_estimators=100)

# fit() returns the estimator itself, so `clf` is the same object as `classifier`.
clf = classifier.fit(X_train, y_train)

# Predicted category ids for the held-out rows.
y_pred = clf.predict(X_test)

In [11]:
# 5-fold cross-validation, run on the training split only.
from sklearn.model_selection import cross_val_score

scores = cross_val_score(classifier, X_train, y_train, cv=5)

In [12]:
# Average of the five per-fold scores returned by cross_val_score above.
scores.mean()

0.8092731737627588

# Hyperparameter Tuning

In [13]:
# Refit with a larger forest: n_estimators = 1000 (same seed as before).
# This rebinds `classifier`, `clf` and `y_pred` from the 100-tree run.
classifier = RandomForestClassifier(random_state=42, n_estimators=1000)
clf = classifier.fit(X_train, y_train)

# fit() returns the estimator itself, so predicting through `clf` is
# the same as predicting through `classifier`.
y_pred = clf.predict(X_test)

# Performance Metric

1. Accuracy

In [14]:
from sklearn.metrics import accuracy_score

# Fraction of held-out rows whose predicted id equals the true id,
# reported as a percentage.
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy of Random Forest model:", acc*100)

Accuracy of Random Forest model: 81.875


2. F1 Score

In [16]:
from sklearn.metrics import f1_score

print("F1 Score for all the categories:")

# average=None returns one F1 value per class instead of a single aggregate.
f1_score(y_true=y_test, y_pred=y_pred, average=None)

F1 Score for all the categories:


array([0.83137255, 0.71682627, 0.81831395, 0.85182049, 0.8058489 ])

3. Confusion Matrix

In [17]:
from sklearn.metrics import confusion_matrix

# Rows are true category ids, columns are predicted category ids.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[ 424,    8,   40,   35,   19],
       [  15,  262,   19,   61,   33],
       [  14,   11,  563,   40,   32],
       [  23,   48,   56, 1006,   53],
       [  18,   12,   38,   34,  496]], dtype=int64)

4. Precision-Recall

In [18]:
from sklearn.metrics import classification_report

# Per-class precision, recall, F1 and support, plus the averaged rows.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.86      0.81      0.83       526
           1       0.77      0.67      0.72       390
           2       0.79      0.85      0.82       660
           3       0.86      0.85      0.85      1186
           4       0.78      0.83      0.81       598

   micro avg       0.82      0.82      0.82      3360
   macro avg       0.81      0.80      0.80      3360
weighted avg       0.82      0.82      0.82      3360



References
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
https://medium.com/all-things-ai/in-depth-parameter-tuning-for-random-forest-d67bb7e920d