In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
from google.colab import drive
import re
%matplotlib inline

  import pandas.util.testing as tm


In [3]:
# Mount Google Drive inside the Colab runtime; later cells read and write files under /content/drive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import pickle

# NOTE: unpickling can execute arbitrary code, so only load files you produced yourself.
# The context manager closes the file handle as soon as the frame has been read
# (the bare open() call previously left it open for the life of the kernel).
with open('/content/drive/My Drive/NewsClassifier/News_data_with_length', 'rb') as source_file:
    data = pickle.load(source_file)

In [5]:
# Summarise the loaded frame: column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 196390 entries, 0 to 200852
Data columns (total 8 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   category                  196390 non-null  object
 1   headline                  196390 non-null  object
 2   authors                   196390 non-null  object
 3   short_description         196390 non-null  object
 4   headline_length           196390 non-null  int64 
 5   short_description_length  196390 non-null  int64 
 6   information               196390 non-null  object
 7   info_length               196390 non-null  int64 
dtypes: int64(3), object(5)
memory usage: 13.5+ MB


In [6]:
# Peek at the last two rows of the loaded frame.
data.tail(2)

Unnamed: 0,category,headline,authors,short_description,headline_length,short_description_length,information,info_length
200851,SPORTS,Aldon Smith Arrested: 49ers Linebacker Busted ...,,CORRECTION: An earlier version of this story i...,8,20,Aldon Smith Arrested: 49ers Linebacker Busted ...,28
200852,SPORTS,Dwight Howard Rips Teammates After Magic Loss ...,,The five-time all-star center tore into his te...,9,19,Dwight Howard Rips Teammates After Magic Loss ...,28


In [0]:
# Drop the per-field text columns and their length columns; everything else is kept.
unused_columns = ['headline', 'short_description', 'headline_length', 'short_description_length']
df = data.drop(columns=unused_columns)

In [8]:
# Confirm which columns remain after the drop.
df.head(2)

Unnamed: 0,category,authors,information,info_length
0,CRIME,Melissa Jeltsen,There Were 2 Mass Shootings In Texas Last Week...,27
1,ENTERTAINMENT,Andy McDonald,Will Smith Joins Diplo And Nicky Jam For The 2...,20


### Preprocessing

In [9]:
# Show the raw `information` text of the record(s) with the largest info_length.
longest_mask = df.info_length == df.info_length.max()
df.loc[longest_mask, 'information'].values

array(['Sunday Roundup. This week the nation watched as the #NeverTrump movement folded faster than one of the presumptive nominee\'s beachfront developments. As many tried to explain away Trump\'s reckless, racist extremism, a few put principle over party. The wife of former Republican Senator Bob Bennett, who died on May 4, revealed that her husband spent his dying hours reaching out to Muslims. "He would go to people with the hijab [on] and tell them he was glad they were in America," she told the Daily Beast. "He wanted to apologize on behalf of the Republican Party." In the U.K., Prime Minister David Cameron called Trump\'s proposal to ban Muslims from entering the U.S., "divisive, stupid and wrong." Trump\'s reply was that he didn\'t think he and Cameron would "have a very good relationship." The press is also doing its part to whitewash extremism. The New York Times called Trump\'s racism "a reductive approach to ethnicity," and said Trump\'s attitude toward women is "complex" a

In [0]:
from nltk.stem import PorterStemmer
from nltk.tokenize import TreebankWordTokenizer
# Module-level instances, created once and reused by process() for every row.
stemmer = PorterStemmer()
tokenizer = TreebankWordTokenizer()

In [0]:
def clean_text(text):
  """Normalise a raw text string before tokenisation.

  Steps, in order:
    1. replace @handles (letters, digits, underscores) with a space;
    2. replace http/https URLs with a space;
    3. replace every character other than ASCII letters and . ! ? ' with a space;
    4. collapse runs of spaces into one;
    5. lowercase the result.

  Args:
    text: the string to clean.

  Returns:
    The cleaned, lowercased string. Leading/trailing single spaces are not stripped.
  """
  # Include "_" so handles such as @user_name are removed whole instead of
  # leaving "name" behind as a spurious token.
  text = re.sub(r"@[A-Za-z0-9_]+", " ", text)
  # Consume the URL up to the next whitespace; a narrower character class stops at
  # "-", "_", "?", "=" or "%" and leaves the tail of the URL in the text.
  text = re.sub(r"https?://\S+", " ", text)
  text = re.sub(r"[^A-Za-z.!?']", " ", text)  # keep only letters and a few punctuation marks
  text = re.sub(r" +", " ", text)  # collapse repeated spaces
  return text.lower()

In [0]:
# Run clean_text over every `information` string and store the result alongside it.
df['cleaned_info'] = df['information'].map(clean_text)

In [13]:
# Cleaned text of the record(s) with the largest info_length, kept in `sentence` for inspection.
sentence = df.loc[df.info_length == df.info_length.max(), 'cleaned_info'].values
sentence

array(["sunday roundup. this week the nation watched as the nevertrump movement folded faster than one of the presumptive nominee's beachfront developments. as many tried to explain away trump's reckless racist extremism a few put principle over party. the wife of former republican senator bob bennett who died on may revealed that her husband spent his dying hours reaching out to muslims. he would go to people with the hijab on and tell them he was glad they were in america she told the daily beast. he wanted to apologize on behalf of the republican party. in the u.k. prime minister david cameron called trump's proposal to ban muslims from entering the u.s. divisive stupid and wrong. trump's reply was that he didn't think he and cameron would have a very good relationship. the press is also doing its part to whitewash extremism. the new york times called trump's racism a reductive approach to ethnicity and said trump's attitude toward women is complex and defies simple categorization a

In [0]:
def process(text):
  """Tokenise `text` and stem each token, returning one space-joined string.

  Uses the module-level `tokenizer` to split the text and the module-level
  `stemmer` to reduce every token to its stem.

  Note: in the sample output, periods inside the text stay attached to the
  preceding word (e.g. "developments."), so such tokens are stemmed with the
  period included.
  """
  stemmed_tokens = (stemmer.stem(token) for token in tokenizer.tokenize(text))
  return ' '.join(stemmed_tokens)


In [0]:
# Tokenise and stem every cleaned string; the result is the text fed to the vectorizer later.
df['processed_info'] = df['cleaned_info'].map(process)

In [16]:
# Inspect the stemmed text of the record(s) with the largest info_length.
df.loc[df.info_length == df.info_length.max(), 'processed_info'].values

array(["sunday roundup. thi week the nation watch as the nevertrump movement fold faster than one of the presumpt nomine 's beachfront developments. as mani tri to explain away trump 's reckless racist extrem a few put principl over party. the wife of former republican senat bob bennett who die on may reveal that her husband spent hi die hour reach out to muslims. he would go to peopl with the hijab on and tell them he wa glad they were in america she told the daili beast. he want to apolog on behalf of the republican party. in the u.k. prime minist david cameron call trump 's propos to ban muslim from enter the u.s. divis stupid and wrong. trump 's repli wa that he did n't think he and cameron would have a veri good relationship. the press is also do it part to whitewash extremism. the new york time call trump 's racism a reduct approach to ethnic and said trump 's attitud toward women is complex and defi simpl categor as if sexism is suddenli as complic as string theory. not everybod

#### The Treebank tokenizer splits contractions and possessives into separate tokens, e.g. isn't --> is n't, trump's --> trump 's
#### The Porter stemmer reduces each word to a root form using a fixed set of rules, e.g. reply --> repli, cats --> cat. It is slightly less aggressive than other stemming options.

In [17]:
# Compare the raw, cleaned and stemmed text side by side for the first two rows.
df.head(2)

Unnamed: 0,category,authors,information,info_length,cleaned_info,processed_info
0,CRIME,Melissa Jeltsen,There Were 2 Mass Shootings In Texas Last Week...,27,there were mass shootings in texas last week b...,there were mass shoot in texa last week but on...
1,ENTERTAINMENT,Andy McDonald,Will Smith Joins Diplo And Nicky Jam For The 2...,20,will smith joins diplo and nicky jam for the w...,will smith join diplo and nicki jam for the wo...


In [18]:
# Model input = stemmed text followed by the author string, separated by one space.
df['inputs'] = df['processed_info'].str.cat(df['authors'], sep=' ')
df.head(2)

Unnamed: 0,category,authors,information,info_length,cleaned_info,processed_info,inputs
0,CRIME,Melissa Jeltsen,There Were 2 Mass Shootings In Texas Last Week...,27,there were mass shootings in texas last week b...,there were mass shoot in texa last week but on...,there were mass shoot in texa last week but on...
1,ENTERTAINMENT,Andy McDonald,Will Smith Joins Diplo And Nicky Jam For The 2...,20,will smith joins diplo and nicky jam for the w...,will smith join diplo and nicki jam for the wo...,will smith join diplo and nicki jam for the wo...


In [19]:
# Full model-input string for the row labelled 0.
df.loc[0, 'inputs']

'there were mass shoot in texa last week but onli on tv. she left her husband. he kill their children. just anoth day in america . Melissa Jeltsen'

In [0]:
import pickle

# Persist the processed frame so the preprocessing does not have to be rerun.
# The context manager guarantees the file is flushed and closed; the bare open()
# call previously left the write handle open, risking an incomplete file on Drive.
with open('/content/drive/My Drive/NewsClassifier/inputs', 'wb') as target_file:
    pickle.dump(df, target_file)

## TRAINING

In [0]:
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer

In [0]:
# Import the splitter explicitly: a bare `import sklearn` does not guarantee that the
# `sklearn.model_selection` submodule has been loaded.
from sklearn.model_selection import train_test_split

# Hold out 30% of the records for evaluation. Fixing random_state makes the split,
# and therefore every score reported below, reproducible across kernel restarts.
X_train, X_test, Y_train, Y_test = train_test_split(
    df['inputs'], df['category'], test_size=0.3, random_state=42
)

In [0]:
# Convert each of the four split pieces to a NumPy array.
X_train, X_test, Y_train, Y_test = (
    np.array(split_part) for split_part in (X_train, X_test, Y_train, Y_test)
)

In [53]:
# Number of training documents (one input string per element).
X_train.shape

(137473,)

In [0]:
# Build TF-IDF features from the training split only.
# min_df=2 ignores terms found in fewer than two documents;
# max_df=0.5 ignores terms found in more than half of the documents.
vectorizer = TfidfVectorizer(min_df=2,max_df=0.5) ###  choose tokens with appropriate document frequency
tfidf_train =vectorizer.fit_transform(X_train)

In [0]:
# Transform the test split with the vectorizer already fitted on the training split
# (transform only, no refit).
tfidf_test = vectorizer.transform(X_test)

In [28]:
# Display the shape and sparsity summary of the training feature matrix.
tfidf_train

<137473x45166 sparse matrix of type '<class 'numpy.float64'>'
	with 3727377 stored elements in Compressed Sparse Row format>

### Logistic Regression: 73%

##### Logistic regression is much faster to train than other algorithms such as SVM. A linear SVM has a stricter cost function than logistic regression, which can provide more assurance, but it has a higher time complexity.

In [0]:
from sklearn.linear_model import LogisticRegression
# Fit a logistic regression with default hyperparameters on the TF-IDF training features.
# NOTE(review): the default iteration limit may be too low for this many features and
# classes — check whether fit() emits a ConvergenceWarning and raise max_iter if so.
logistic_Regression = LogisticRegression()
logistic_Regression.fit(tfidf_train,Y_train)

In [0]:
# Predict a category for every held-out document.
predictions = logistic_Regression.predict(tfidf_test)

In [40]:
from sklearn.metrics import confusion_matrix,classification_report

# Print the confusion matrix, a blank gap, then the per-class precision/recall/F1 table.
report_parts = (
    confusion_matrix(Y_test, predictions),
    '\n',
    classification_report(Y_test, predictions),
)
for report_part in report_parts:
    print(report_part)

[[192  43   5 ...   4   0   1]
 [ 62 176   9 ...   3   1   0]
 [  2   1 655 ...  16   2   0]
 ...
 [  0   0  14 ... 434   3   2]
 [  0   1   2 ...   5 175  16]
 [  1   0   1 ...   4  11 189]]


                precision    recall  f1-score   support

          ARTS       0.59      0.45      0.51       423
ARTS & CULTURE       0.75      0.44      0.55       404
  BLACK VOICES       0.71      0.51      0.60      1273
      BUSINESS       0.61      0.60      0.61      1719
       COLLEGE       0.67      0.38      0.48       332
        COMEDY       0.70      0.63      0.66      1541
         CRIME       0.63      0.59      0.61       949
CULTURE & ARTS       0.80      0.41      0.54       327
       DIVORCE       0.87      0.72      0.78       990
     EDUCATION       0.67      0.45      0.53       303
 ENTERTAINMENT       0.70      0.85      0.77      4628
   ENVIRONMENT       0.74      0.27      0.40       404
         FIFTY       0.81      0.35      0.49       373
  FOOD & DRINK       