# Reddit Flare Classifier


In [52]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
import pickle
from sklearn import linear_model
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.metrics import precision_recall_fscore_support as score

## Reading data from CSV

In [3]:
data = pd.read_csv('data/reddit_01.csv')
data.head()

Unnamed: 0,flare,over_18,score,awards,creation_time,corpous
0,Non-Political,0.0,2.75642,6.863544,0.990808,Here to share my ballpoint pen drawing with my...
1,Politics,0.0,1.07529,0.0,1.007463,John Oliver slams Arnab Goswami and Hotstar
2,Coronavirus,0.0,3.204018,0.0,1.008445,A deserted East Express Highway in Mumbai
3,Politics,0.0,1.177017,0.0,1.007097,Lesson for Delhi Police as Kolkata Police arre...
4,Non-Political,0.0,1.874788,0.0,0.914764,Former President APJ Abdul Kalam passes away


In [4]:
data.dtypes

flare             object
over_18          float64
score            float64
awards           float64
creation_time    float64
corpous           object
dtype: object

In [5]:
X_num = data[data.columns.difference(['flare','corpous','awards','creation_time'] )]

In [6]:
X_num.head()

Unnamed: 0,over_18,score
0,0.0,2.75642
1,0.0,1.07529
2,0.0,3.204018
3,0.0,1.177017
4,0.0,1.874788


In [7]:
X_num.dtypes

over_18    float64
score      float64
dtype: object

In [8]:
flares =list(set(data['flare']))
flares

['Politics',
 'Food',
 'Sports',
 'CAA-NRC-NPR',
 'Policy/Economy',
 'Photography',
 'AskIndia',
 'Scheduled',
 'Science/Technology',
 'Business/Finance',
 'Non-Political',
 'Coronavirus']

In [9]:
#X,Y_text=data.loc[:, data.columns != 'flare'],data['flare']

In [10]:
X_text,Y_text=data.corpous.values,data.flare.values

In [11]:
X_text.shape

(1685,)

In [12]:
Y = [flares.index(s) for s in Y_text]
Y

[10,
 0,
 11,
 0,
 10,
 5,
 8,
 0,
 10,
 0,
 0,
 8,
 6,
 6,
 0,
 10,
 0,
 10,
 9,
 10,
 10,
 0,
 0,
 9,
 0,
 10,
 10,
 10,
 8,
 4,
 10,
 10,
 10,
 10,
 0,
 6,
 1,
 10,
 10,
 11,
 0,
 10,
 8,
 10,
 6,
 10,
 10,
 0,
 10,
 0,
 10,
 10,
 0,
 0,
 10,
 10,
 0,
 9,
 0,
 10,
 10,
 10,
 10,
 10,
 10,
 9,
 2,
 6,
 0,
 10,
 10,
 0,
 9,
 10,
 0,
 11,
 11,
 4,
 0,
 0,
 10,
 0,
 10,
 11,
 2,
 0,
 10,
 0,
 10,
 0,
 10,
 0,
 10,
 10,
 10,
 11,
 10,
 10,
 10,
 0,
 2,
 2,
 10,
 10,
 10,
 10,
 0,
 0,
 0,
 0,
 10,
 10,
 10,
 0,
 6,
 10,
 9,
 10,
 6,
 10,
 10,
 0,
 10,
 10,
 11,
 11,
 0,
 10,
 10,
 0,
 11,
 10,
 10,
 11,
 0,
 0,
 10,
 11,
 0,
 10,
 10,
 6,
 11,
 10,
 4,
 0,
 10,
 10,
 0,
 11,
 5,
 0,
 0,
 11,
 11,
 11,
 10,
 0,
 10,
 11,
 6,
 6,
 10,
 0,
 10,
 10,
 10,
 10,
 0,
 10,
 11,
 10,
 6,
 6,
 10,
 5,
 10,
 0,
 10,
 10,
 2,
 10,
 2,
 10,
 10,
 0,
 0,
 11,
 11,
 5,
 10,
 10,
 10,
 11,
 11,
 11,
 0,
 0,
 11,
 10,
 10,
 5,
 0,
 0,
 10,
 10,
 10,
 0,
 0,
 10,
 10,
 11,
 10,
 10,
 4,
 6,
 10,
 0,
 0,
 0

In [13]:
count_vectorizer = CountVectorizer()
count_vectorizer.fit_transform(X_text)

freq_term_matrix = count_vectorizer.transform(X_text)
tfidf = TfidfTransformer(norm="l2")
tfidf.fit(freq_term_matrix)
tf_idf_matrix = tfidf.transform(freq_term_matrix)

## Saving object with pickle

In [14]:
with open("vocab.obj",'wb') as file:
    pickle.dump(count_vectorizer, file)

In [15]:
text_features=tf_idf_matrix.todense()
text_features

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

## Train-Test Split

In [16]:
X_train,X_test=np.c_[X_num[:1585].values,text_features[:1585]],np.c_[X_num[1585:].values,text_features[1585:]]

Y_train,Y_test=Y[:1585],Y[1585:]

In [17]:
X_num.values[1:8]

array([[0.        , 1.07529014],
       [0.        , 3.204018  ],
       [0.        , 1.17701696],
       [0.        , 1.87478758],
       [0.        , 2.980219  ],
       [0.        , 3.21908864],
       [0.        , 2.72025047]])

In [18]:
a = [1,3]
np.c_[a,a]

array([[1, 1],
       [3, 3]])

In [19]:
text_features.shape

(1685, 9706)

## Accuracy

In [42]:
logreg = linear_model.LogisticRegression(C=1e5)
model=logreg.fit(X_train,Y_train)
pred=model.predict(X_test)
accuracy_score(Y_test, pred)



0.64

In [21]:
with open("model.obj",'wb') as file:
    pickle.dump(model, file)

In [48]:
pred_one = model.predict(np.c_[0,1.9,tfidf.transform((count_vectorizer.transform(['"Doland, Dont do Press Conference..."',]))).todense()])
flares[int(pred_one)]

'Politics'

In [43]:
pred

array([ 0, 11,  0,  0, 11, 11,  6, 10,  6, 10, 11, 10,  6, 11,  0, 10, 11,
       11,  6, 11, 10,  0, 10, 11,  0,  6, 10, 11, 10, 10, 11, 10,  6, 10,
        1, 11,  6,  0,  0, 11, 10, 10, 11, 10, 10, 10, 10, 10, 11,  0,  0,
        6,  0, 11,  0, 11, 10, 10, 11,  6,  0, 11, 10,  6, 10, 11, 11, 11,
       11, 11, 11,  0, 11,  0, 11,  6,  6,  0, 11, 10, 10, 11,  0, 11,  0,
       11, 10, 10, 11, 11, 10, 10,  6,  0,  0, 10, 11, 11,  6,  0])

## Confusion matrix