In [104]:
# Silence every warning category for the rest of the session. Note that this
# also hides deprecation warnings raised by the libraries used below.
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import re
import string

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer,HashingVectorizer
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression

# Rebinds the name `stopwords` from the imported NLTK corpus reader to the
# English stop-word list it returns. The list is not referenced again in the
# visible cells.
stopwords = stopwords.words('english')

In [105]:
# Load the training corpus and the external test corpus from CSV files in the
# working directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('no_stopwords_combined.csv', 'testnostopwords.csv')
)

In [106]:
# Preview the first rows of the training frame.
# NOTE(review): the `Unnamed: 0*` columns in the output look like index columns
# written out by earlier `to_csv` calls - confirm and drop them if so.
train.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,title,text,subject,date,target
0,0,0,Ramaphosa's ANC election win lifts South Afric...,johannesburg reuters south african banking sto...,worldnews,"December 19, 2017",1
1,1,1,VIGILANTE PIRATES INTERCEDE Where Government F...,like soldiers oden vigilante group reported fe...,Government News,"Apr 1, 2016",0
2,2,2,"SICK! DEMOCRAT ORGANIZER, Mayor DeBlasio Emplo...",last week huma abedin husband anthony weiner w...,left-news,"May 28, 2017",0
3,3,3,Dennis Rodman talks of skiing friendship with ...,edinburgh reuters us basketball legend dennis ...,worldnews,"September 6, 2017",1
4,4,4,New Hampshire Cops Intent On Protecting Sadis...,lynchings black people still happen jim crow c...,News,"September 10, 2017",0


In [107]:
# Preview the first rows of the external test frame. Its column names
# (`article_title`, `location`) differ from the training frame; only `text`
# and `target` are shared.
test.head()

Unnamed: 0.1,Unnamed: 0,article_title,text,location,target
0,0,Syria attack symptoms consistent with nerve ag...,wed apr syria attack symptoms consistent nerve...,idlib,0
1,1,Homs governor says U.S. attack caused deaths b...,fri apr homs governor says us attack caused de...,homs,0
2,2,Death toll from Aleppo bomb attack at least 112,sun apr death toll aleppo bomb attack least de...,aleppo,0
3,3,Aleppo bomb blast kills six Syrian state TV,wed apr aleppo bomb blast kills six syrian sta...,aleppo,0
4,4,29 Syria Rebels Dead in Fighting for Key Alepp...,sun jul syria rebels dead fighting key aleppo ...,aleppo,0


In [108]:
# Disabled cleaning step: everything below is wrapped in a bare string literal,
# so none of it executes and `train` / `test` are left untouched. The cell only
# echoes the string back as its output.
'''
def clean_text(text):
    
    text = text.lower()
    text = re.sub('[\'\[\],]', '', text)

    return text

test['text'] = test['text'].apply(lambda x : clean_text(x))
train['text'] = train['text'].apply(lambda x : clean_text(x))
'''

"\ndef clean_text(text):\n    \n    text = text.lower()\n    text = re.sub('['\\[\\],]', '', text)\n\n    return text\n\ntest['text'] = test['text'].apply(lambda x : clean_text(x))\ntrain['text'] = train['text'].apply(lambda x : clean_text(x))\n"

In [109]:
# Re-display the test frame. The preceding cleaning cell is disabled, so the
# rows are the same as in the earlier preview.
test.head()

Unnamed: 0.1,Unnamed: 0,article_title,text,location,target
0,0,Syria attack symptoms consistent with nerve ag...,wed apr syria attack symptoms consistent nerve...,idlib,0
1,1,Homs governor says U.S. attack caused deaths b...,fri apr homs governor says us attack caused de...,homs,0
2,2,Death toll from Aleppo bomb attack at least 112,sun apr death toll aleppo bomb attack least de...,aleppo,0
3,3,Aleppo bomb blast kills six Syrian state TV,wed apr aleppo bomb blast kills six syrian sta...,aleppo,0
4,4,29 Syria Rebels Dead in Fighting for Key Alepp...,sun jul syria rebels dead fighting key aleppo ...,aleppo,0


In [110]:
# Re-display the training frame. The preceding cleaning cell is disabled, so
# the rows are the same as in the earlier preview.
train.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,title,text,subject,date,target
0,0,0,Ramaphosa's ANC election win lifts South Afric...,johannesburg reuters south african banking sto...,worldnews,"December 19, 2017",1
1,1,1,VIGILANTE PIRATES INTERCEDE Where Government F...,like soldiers oden vigilante group reported fe...,Government News,"Apr 1, 2016",0
2,2,2,"SICK! DEMOCRAT ORGANIZER, Mayor DeBlasio Emplo...",last week huma abedin husband anthony weiner w...,left-news,"May 28, 2017",0
3,3,3,Dennis Rodman talks of skiing friendship with ...,edinburgh reuters us basketball legend dennis ...,worldnews,"September 6, 2017",1
4,4,4,New Hampshire Cops Intent On Protecting Sadis...,lynchings black people still happen jim crow c...,News,"September 10, 2017",0


In [111]:
# `*_raw` are plain aliases of the loaded frames (same objects, no copy).
train_raw, test_raw = train, test

# `*_copy` are independent deep copies, so later edits do not touch the
# loaded frames.
train_copy, test_copy = (frame.copy(deep=True) for frame in (train, test))

In [112]:
# Drop, in place, every row whose `text` is missing from both working copies.
for frame in (train_copy, test_copy):
    frame.dropna(subset=['text'], inplace=True)

### Raw counts (default `CountVectorizer`)

In [113]:
# Bag-of-words with default settings: learn the vocabulary on the training
# text only, then project the test text onto that same vocabulary.
count_vectorizer = CountVectorizer()
train_vectors, test_vectors = (
    count_vectorizer.fit_transform(train_copy['text']),
    count_vectorizer.transform(test_copy['text']),
)

# (documents, vocabulary size) for train and test
tuple(matrix.shape for matrix in (train_vectors, test_vectors))

((44180, 199319), (804, 199319))

### `min_df` and `max_df` parameters

In [114]:
# Prune the vocabulary by document frequency: keep terms that occur in at
# least 2 documents and in no more than 80% of the documents.
count_vectorizer = CountVectorizer(max_df=0.8, min_df=2)
train_vectors, test_vectors = (
    count_vectorizer.fit_transform(train_copy['text']),
    count_vectorizer.transform(test_copy['text']),
)

# (documents, vocabulary size) for train and test
tuple(matrix.shape for matrix in (train_vectors, test_vectors))

((44180, 107650), (804, 107650))

### N-grams

In [115]:
# No extra text preprocessing is requested here: regex cleaning and stop-word
# removal are already taken care of.
#
# Unigrams + bigrams. The documents are supplied to fit_transform() only.
# CountVectorizer's first parameter is `input` (a mode string such as
# 'content'), not the corpus, and current scikit-learn releases accept
# constructor arguments by keyword only.
count_vectorizer = CountVectorizer(preprocessor=None, ngram_range=(1, 2))

train_vectors = count_vectorizer.fit_transform(train_copy['text'])
test_vectors = count_vectorizer.transform(test_copy['text'])

train_vectors.shape, test_vectors.shape

((44180, 4048261), (804, 4048261))

In [116]:
# Show ten of the learned vocabulary terms (keys of the term -> column mapping).
list(count_vectorizer.vocabulary_.keys())[:10]

['johannesburg',
 'reuters',
 'south',
 'african',
 'banking',
 'stocks',
 'rallied',
 'tuesday',
 'buoyed',
 'optimism']

In [117]:
# Strict bigrams: ngram_range=(2, 2) keeps two-word sequences only.
#
# The documents are supplied to fit_transform() only. CountVectorizer's first
# parameter is `input` (a mode string such as 'content'), not the corpus, and
# current scikit-learn releases accept constructor arguments by keyword only.
count_vectorizer = CountVectorizer(preprocessor=None, ngram_range=(2, 2))

train_vectors = count_vectorizer.fit_transform(train_copy['text'])
test_vectors = count_vectorizer.transform(test_copy['text'])

train_vectors.shape, test_vectors.shape

((44180, 3848942), (804, 3848942))

In [118]:
# Show ten of the learned bigram terms (keys of the term -> column mapping).
list(count_vectorizer.vocabulary_.keys())[:10]

['johannesburg reuters',
 'reuters south',
 'south african',
 'african banking',
 'banking stocks',
 'stocks rallied',
 'rallied tuesday',
 'tuesday buoyed',
 'buoyed optimism',
 'optimism newly']

# New Trial Baseline

In [119]:
# Baseline features: unigram + bigram counts, with the vocabulary learned on
# the training text only.
count_vectorizer = CountVectorizer(ngram_range=(1, 2))
train_vectors, test_vectors = (
    count_vectorizer.fit_transform(train_copy['text']),
    count_vectorizer.transform(test_copy['text']),
)

# Design matrix and labels used by the model cells that follow.
X_train, y_train = train_vectors, train_copy['target']

In [120]:
# 5-fold cross-validated F1 of a logistic-regression baseline on the
# unigram + bigram count features.
clf = LogisticRegression(C=1.0)
lr_scores = model_selection.cross_val_score(clf, X_train, y_train, cv=5, scoring="f1")

print(lr_scores)
# Average over however many folds were run, instead of a hard-coded divisor
# that has to be kept in sync with `cv`.
print(lr_scores.mean())

[0.9964986  0.99673279 0.99766464 0.9959136  0.99766573]
0.9968950731512318


### Testing on external dummy test data

In [121]:
# Fit the baseline on the full training matrix. cross_val_score works on
# clones of the estimator, so `clf` itself has not been fitted before this call.
clf.fit(train_vectors, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [122]:
# `accuracy_score` is imported but not used in the visible cells.
from sklearn.metrics import f1_score, accuracy_score

# Labels of the external test set; reused by the TF-IDF evaluation further down.
y_test = test_copy['target']
preds = clf.predict(test_vectors)

# F1 on the external test set.
# NOTE(review): the gap between the cross-validated F1 and this score suggests
# the model relies on source-specific tokens - e.g. 'reuters' appears in the
# target=1 training samples previewed earlier - TODO confirm.
print(f1_score(y_test,preds))

0.5024038461538461


# TF-IDF

In [123]:
# Word-level TF-IDF: every run of one or more word characters is a token, and
# the vocabulary is capped at 5000 features.
# These results are replaced by the next TF-IDF cell before any model uses them.
word_level_params = {
    'analyzer': 'word',
    'token_pattern': r'\w{1,}',
    'max_features': 5000,
}
tfidf = TfidfVectorizer(**word_level_params)

train_tfidf, test_tfidf = (
    tfidf.fit_transform(train_copy['text']),
    tfidf.transform(test_copy["text"]),
)

In [124]:
# Word n-gram TF-IDF: bigrams and trigrams built from word-character tokens,
# with the vocabulary capped at 5000 features.
# These results are replaced by the next TF-IDF cell before any model uses them.
ngram_level_params = {
    'analyzer': 'word',
    'ngram_range': (2, 3),
    'token_pattern': r'\w{1,}',
    'max_features': 5000,
}
tfidf = TfidfVectorizer(**ngram_level_params)

train_tfidf, test_tfidf = (
    tfidf.fit_transform(train_copy['text']),
    tfidf.transform(test_copy["text"]),
)

In [125]:
# Character-level TF-IDF: 2- and 3-character n-grams, with the vocabulary
# capped at 5000 features. `token_pattern` is deliberately not passed:
# scikit-learn only applies it when analyzer='word', so it has no effect on
# character n-grams.
# This is the vectorizer left in `tfidf` for the TF-IDF baseline cells below.
tfidf = TfidfVectorizer(analyzer='char',
                        ngram_range=(2,3),
                        max_features=5000)

train_tfidf = tfidf.fit_transform(train_copy['text'])
test_tfidf = tfidf.transform(test_copy["text"])

# Baseline with TF-IDF (character-level 2- and 3-grams)

In [126]:
# Re-fit whichever vectorizer is currently bound to `tfidf`. In a top-to-bottom
# run that is the character-level one, and this repeats the fit/transform
# already done at the end of the cell that defined it.
train_tfidf = tfidf.fit_transform(train_copy['text'])
test_tfidf = tfidf.transform(test_copy["text"])

In [127]:
# 5-fold cross-validated F1 of a fresh logistic-regression model on the
# TF-IDF features.
clf = LogisticRegression(C=1.0)
lr_scores = model_selection.cross_val_score(clf,
                                            train_tfidf,
                                            train_copy["target"],
                                            cv=5, scoring="f1")

print(lr_scores)
# Average over however many folds were run, instead of a hard-coded divisor
# that has to be kept in sync with `cv`.
print(lr_scores.mean())

[0.97863896 0.98068869 0.97995805 0.97976744 0.97853081]
0.9795167921326756


In [128]:
# Fit the TF-IDF baseline on the full training matrix. cross_val_score works on
# clones of the estimator, so `clf` itself has not been fitted before this call.
clf.fit(train_tfidf, train_copy["target"])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [129]:
# Score the TF-IDF model on the external test set. `y_test` and `f1_score`
# come from the earlier count-vector evaluation cell.
preds_tfidf = clf.predict(test_tfidf)
tfidf_external_f1 = f1_score(y_test, preds_tfidf)

print(tfidf_external_f1)

0.6172344689378757
