In [15]:
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords
import re
import string

import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

#sklearn
from sklearn import model_selection
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV,StratifiedKFold,RandomizedSearchCV

import matplotlib.pyplot as plt
import seaborn as sns

import os

import warnings
warnings.filterwarnings('ignore')

In [21]:
train = pd.read_csv('Datasets\Single Source\Treated\combined_train_treated.csv')
test = pd.read_csv('Datasets\Single Source\Treated\combined_test_treated.csv')

In [17]:
train_copy = train.copy(deep = True)
test_copy = test.copy(deep = True)

In [18]:
train_copy.head()

Unnamed: 0.1,Unnamed: 0,title,text,subject,date,target,publisher
0,0,GUESS WHO PAID FOR FAMILY OF GOV DEPENDENT MUS...,guess paid family gov dependent muslim terrori...,politics,"Apr 24, 2015",0,
1,1,Defector says thousands of Islamic State fight...,defector says thousands islamic state fighters...,worldnews,"December 7, 2017",1,ANKARA (Reuters)
2,2,Thai king's remains laid to rest at end of fiv...,thai kings remains laid rest end fiveday cerem...,worldnews,"October 29, 2017",1,BANGKOK (Reuters)
3,3,"'I am sorry,' British PM May says of botched e...",sorry british pm may says botched election bri...,worldnews,"October 1, 2017",1,"MANCHESTER, England (Reuters)"
4,4,It’s Bad Enough Canadian Magazine Puts A Smili...,bad enough canadian magazine puts smiling terr...,politics,"Nov 7, 2015",0,


In [19]:
test_copy.head()

Unnamed: 0.1,Unnamed: 0,title,text,subject,date,target,publisher
0,0,Boiler Room EP #69 – CULTure Club,boiler room ep culture club tune alternate cur...,US_News,"August 18, 2016",0,
1,1,WOW! BLACK LIVES MATTER MOB Celebrating NYC Po...,wow black lives matter mob celebrating nyc pol...,left-news,"Aug 2, 2016",0,
2,2,Trump visit to Britain still unfixed nine mont...,trump visit britain still unfixed nine months ...,worldnews,"September 8, 2017",1,LONDON (Reuters)
3,3,Henningsen: Obama White House Colluded with Fa...,henningsen obama white house colluded facebook...,US_News,"October 6, 2017",0,
4,4,UN OFFICIAL TIED TO CLINTONS Set To Face Trial...,un official tied clintons set face trial found...,left-news,"Jun 26, 2016",0,


In [34]:
train_copy.isnull().sum()

Unnamed: 0        0
title             0
text              0
subject           0
date              0
target            0
publisher     20000
dtype: int64

In [35]:
test_copy.isnull().sum()

Unnamed: 0       0
title            0
text             0
subject          0
date             0
target           0
publisher     3481
dtype: int64

In [33]:
train_copy['text'].fillna('No Text', inplace = True)
test_copy['text'].fillna('No Text', inplace = True)

## Bag of Words

In [40]:
count_vectorizer = CountVectorizer()
train_vectors = count_vectorizer.fit_transform(train_copy['text'])

### TF-IDF Features

In [41]:
tfidf = TfidfVectorizer(min_df = 2, max_df = 0.5, ngram_range =(1,2))
train_tfidf = tfidf.fit_transform(train_copy['text'])

# Text Classifier

In [42]:
clf = LogisticRegression(C = 1.0)
scores = model_selection.cross_val_score(clf, train_vectors, 
                                         train_copy['target'],
                                         cv = 5, scoring = 'f1')
print(scores)
print(sum(scores)/5)

[0.98341853 0.98436523 0.98626717 0.98758309 0.98436914]
0.985200630408795


In [43]:
clf_tfidf = LogisticRegression(C = 1.0)
scores_tfidf = model_selection.cross_val_score(clf_tfidf, train_tfidf, 
                                         train['target'],
                                        cv = 5, scoring = 'f1')
print(scores_tfidf)
print(sum(scores)/5)

[0.97840469 0.97722723 0.97908887 0.98036764 0.97856874]
0.985200630408795


### On test

In [45]:
test_vectors = count_vectorizer.transform(test_copy['text'])

In [46]:
test_tfidf = tfidf.transform(test_copy['text'])

In [47]:
#Fit using whole train samples

clf.fit(train_vectors, train_copy['target'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

#### Without TFIDF

In [48]:
y_test = test_copy['target']
preds = clf.predict(test_vectors)

print(f1_score(y_test,preds))

0.9284525790349418


#### TFIDF Result

In [50]:
clf_tfidf.fit(train_tfidf,train_copy['target'])

pred_tfidf = clf_tfidf.predict(test_tfidf)
print(f1_score(test_copy['target'],pred_tfidf))

0.9036845507433744
