# Project 

In [1]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import make_scorer, confusion_matrix
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt

# The following line is needed to show plots inline in notebooks
%matplotlib inline 

In [2]:
data = pd.read_csv('train.csv')

In [3]:
data.head()

Unnamed: 0.1,Unnamed: 0,claim,claimant,date,id,label,related_articles
0,0,A line from George Orwell's novel 1984 predict...,,17/07/2017,0,0,"[122094, 122580, 130685, 134765]"
1,1,Maine legislature candidate Leslie Gibson insu...,,17/03/2018,1,2,"[106868, 127320, 128060]"
2,2,A 17-year-old girl named Alyssa Carson is bein...,,18/07/2018,4,1,"[132130, 132132, 149722]"
3,3,In 1988 author Roald Dahl penned an open lette...,,04/02/2019,5,2,"[123254, 123418, 127464]"
4,4,"When it comes to fighting terrorism, ""Another ...",Hillary Clinton,22/03/2016,6,2,"[41099, 89899, 72543, 82644, 95344, 88361]"


In [4]:
data.shape

(15555, 7)

## Data Cleaning

### Claimant

In [5]:
# There are 4962 missing values in claimant
data[data['claimant'].isna()].shape[0]

4962

In [6]:
claimant_count = data['claimant'].value_counts()
claimant_count

Donald Trump                                          1233
Bloggers                                               372
Barack Obama                                           234
Hillary Clinton                                        220
Viral image                                            127
Facebook posts                                         108
Ted Cruz                                               106
Various websites                                       106
Bernie Sanders                                         101
Marco Rubio                                             97
Scott Walker                                            90
John McCain                                             88
Rick Perry                                              77
Rick Scott                                              73
Chain email                                             71
Facebook user                                           71
multiple sources                                        

In [7]:
# Group together all counts less than 100 in to Others
value_mask = data.claimant.isin(claimant_count.index[claimant_count < 100]) 
data.loc[value_mask,'claimant'] = "Other"
data.claimant.value_counts()

Other               7986
Donald Trump        1233
Bloggers             372
Barack Obama         234
Hillary Clinton      220
Viral image          127
Facebook posts       108
Various websites     106
Ted Cruz             106
Bernie Sanders       101
Name: claimant, dtype: int64

In [8]:
# Of the claimant missing, how many are false-0, partly true-1, true-2?
missing_data = data[data['claimant'].isna()]
missing_data.label.value_counts(normalize=True)
# roughly 87% of claims with missing claimants are not completely true

0    0.611447
1    0.259371
2    0.129182
Name: label, dtype: float64

### Date

In [9]:
# 5% of data happen on 2 separate days, November 6th 2017 and October 23rd 2016
date_count = data['date'].value_counts()
date_count

06/11/2017    407
23/10/2016    339
07/07/2016     97
26/09/2018     59
04/10/2016     35
25/02/2019     33
30/01/2019     31
07/03/2019     30
09/10/2016     29
20/02/2019     29
19/10/2016     29
22/10/2018     28
30/01/2018     27
07/02/2019     26
14/09/2018     26
12/12/2018     25
08/05/2018     25
05/02/2019     25
10/04/2019     25
14/02/2019     25
13/03/2018     24
04/04/2019     24
26/10/2018     24
25/09/2018     23
12/03/2019     23
07/09/2016     23
11/02/2019     23
23/08/2017     23
27/07/2016     23
10/10/2018     23
             ... 
14/12/2011      1
01/11/2011      1
12/09/2012      1
21/12/2012      1
20/07/2011      1
04/05/2012      1
17/06/2010      1
18/04/2010      1
06/02/2010      1
25/02/2017      1
11/09/2016      1
08/07/2009      1
28/05/2015      1
21/12/2009      1
06/05/2010      1
25/04/2011      1
31/08/2013      1
28/09/2013      1
06/02/2015      1
25/05/2011      1
27/05/2018      1
26/01/2011      1
18/05/2013      1
06/07/2008      1
01/08/2015

In [10]:
# On average, about 5 claims happen a day
data.date.value_counts().mean()

5.152368333885392

In [11]:
# Group together all counts less than average in to Others
value_mask = data.date.isin(date_count.index[date_count < 30]) 
data.loc[value_mask,'date'] = "Other"
data.date.value_counts()

Other         14524
06/11/2017      407
23/10/2016      339
07/07/2016       97
26/09/2018       59
04/10/2016       35
25/02/2019       33
30/01/2019       31
07/03/2019       30
Name: date, dtype: int64

## Encoding

### Claimant Encode

In [12]:
# Label encoding
data['claimant']=data['claimant'].astype('category')
data['claimant_cat']=data['claimant'].cat.codes
claimant_labels = dict(enumerate(data['claimant'].cat.categories))
claimant_labels

{0: 'Barack Obama',
 1: 'Bernie Sanders',
 2: 'Bloggers',
 3: 'Donald Trump',
 4: 'Facebook posts',
 5: 'Hillary Clinton',
 6: 'Other',
 7: 'Ted Cruz',
 8: 'Various websites',
 9: 'Viral image'}

### Claim Encode

#### CountVectorizer 

In [13]:
vectorizer = CountVectorizer(analyzer = "word", # the feature should be made of word
                             tokenizer = None, 
                             preprocessor = None, 
                             stop_words = 'english', # Remove stop words such as “the”, “a”, etc.
                             max_features = 500)
claim_vec = vectorizer.fit_transform(data['claim'].values.astype('U'))

In [14]:
claim_array = claim_vec.toarray()

In [24]:
vectorizer.get_feature_names()

['000',
 '10',
 '100',
 '11',
 '12',
 '13',
 '14',
 '15',
 '20',
 '200',
 '2011',
 '2012',
 '2013',
 '2014',
 '2015',
 '2016',
 '2017',
 '2018',
 '25',
 '30',
 '300',
 '40',
 '50',
 '500',
 '60',
 '70',
 '80',
 'able',
 'abortion',
 'according',
 'act',
 'actually',
 'administration',
 'africa',
 'african',
 'age',
 'ago',
 'air',
 'allow',
 'allowed',
 'america',
 'american',
 'americans',
 'announced',
 'anti',
 'april',
 'arrested',
 'article',
 'asked',
 'attack',
 'attacks',
 'august',
 'average',
 'away',
 'baby',
 'ban',
 'banned',
 'barack',
 'believe',
 'benefits',
 'bernie',
 'best',
 'better',
 'big',
 'billion',
 'birth',
 'black',
 'border',
 'born',
 'budget',
 'bush',
 'business',
 'buy',
 'california',
 'called',
 'came',
 'campaign',
 'canada',
 'cancer',
 'candidate',
 'car',
 'care',
 'carolina',
 'case',
 'caught',
 'cause',
 'change',
 'check',
 'child',
 'children',
 'china',
 'cities',
 'citizens',
 'city',
 'claimed',
 'claims',
 'class',
 'climate',
 'clinton',

In [26]:
claim_array

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [27]:
claim_array[0]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

#### TFID

In [15]:
# tf_vectorizer = TfidfVectorizer(max_features = 500)
# tf_vec = tf_vectorizer.fit_transform(data['claim'].values.astype('U'))
#print(tf_vectorizer.get_feature_names())

In [16]:
# tf_claim_array = tf_vec.toarray()

### Date Encode

In [17]:
# Label encoding
data['date']=data['date'].astype('category')
data['date_cat']=data['date'].cat.codes
date_labels = dict(enumerate(data['date'].cat.categories))
date_labels

{0: '04/10/2016',
 1: '06/11/2017',
 2: '07/03/2019',
 3: '07/07/2016',
 4: '23/10/2016',
 5: '25/02/2019',
 6: '26/09/2018',
 7: '30/01/2019',
 8: 'Other'}

### Cleaned Data Frame

In [18]:
features = data.drop(['Unnamed: 0', 'claim', 'id', 'related_articles','claimant', 'date'], axis=1)
claim_features = pd.DataFrame(data=claim_array, columns = vectorizer.get_feature_names())
cleaned_features = pd.concat([features, claim_features], axis=1)

##### split the test and train data

In [19]:
# Split test and train data
X = cleaned_features.drop(['label'], axis=1)
y = cleaned_features['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.3)

##### Logistic Regression

In [20]:
model = LogisticRegression()
model.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [21]:
predictions = model.predict(X_test)
    
TN = confusion_matrix(y_test, predictions)[0][0]
FP = confusion_matrix(y_test, predictions)[0][1]
FN = confusion_matrix(y_test, predictions)[1][0]
TP = confusion_matrix(y_test, predictions)[1][1]
total = TN + FP + FN + TP
ACC = (TP + TN) / float(total)

print ("This model got an accuracy of {}% on the testing set".format(round(ACC*100,2)))   

This model got an accuracy of 64.79% on the testing set


In [22]:
model.score(X_test, y_test)

0.5746732376258838

In [23]:
confusion_matrix(y_test, predictions)

array([[1574,  633,   21],
       [ 821, 1101,   12],
       [ 287,  211,    7]])