In [39]:
# this is our preamble cell :
# remember to check for anything missing 
import pandas as pd
from pandas import DataFrame
import numpy as np
import matplotlib 


import category_encoders as ce
from sklearn.model_selection import train_test_split


from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline


from sklearn import cluster
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.tree import plot_tree

In [80]:
# OK, importing and minor cleaning first. 

dfreal = pd.read_csv('True.csv',
                    parse_dates = ['date'])
#                    index_col = 'date')
dfreal['Fake'] = 0
print('Real data shape: ', dfreal.shape)

dffake = pd.read_csv('Fake.csv',
                    parse_dates = ['date'])
#                    index_col = 'date')

dffake['Fake'] = 1
print('Fake data shape: ', dffake.shape)

Real data shape:  (21417, 5)
Fake data shape:  (23481, 5)


In [81]:
# now I'll trim those up so they are the same length - 
# 50% real 50% fake seems reasonable right?

dfreal_trimmed = dfreal[-21_400 :]
print('Real trimmed shape: ', dfreal_trimmed.shape)

dffake_trimmed = dffake[-21_400 :]
print('Fake trimmed shape: ', dffake_trimmed.shape)

# and now combine them into one dataframe:
df_joined = dfreal_trimmed.append(dffake_trimmed, ignore_index=True)

#df_joined['date'] = pd.to_datetime(df_joined['date']).dt.date

print()
print('Combined and trimmed (equal parts Real and Fake) shape: ', df_joined.shape)
print()
#print(df_joined.head(1))
df_joined.head(25)

Real trimmed shape:  (21400, 5)
Fake trimmed shape:  (21400, 5)

Combined and trimmed (equal parts Real and Fake) shape:  (42800, 5)



Unnamed: 0,title,text,subject,date,Fake
0,"Trump on Twitter (Dec 26) - Hillary Clinton, T...",The following statements were posted to the ve...,politicsNews,2017-12-26 00:00:00,0
1,U.S. appeals court rejects challenge to Trump ...,(Reuters) - A U.S. appeals court in Washington...,politicsNews,2017-12-26 00:00:00,0
2,Treasury Secretary Mnuchin was sent gift-wrapp...,(Reuters) - A gift-wrapped package addressed t...,politicsNews,2017-12-24 00:00:00,0
3,Federal judge partially lifts Trump's latest r...,WASHINGTON (Reuters) - A federal judge in Seat...,politicsNews,2017-12-24 00:00:00,0
4,Exclusive: U.S. memo weakens guidelines for pr...,NEW YORK (Reuters) - The U.S. Justice Departme...,politicsNews,2017-12-23 00:00:00,0
5,Trump travel ban should not apply to people wi...,(Reuters) - A U.S. appeals court on Friday sai...,politicsNews,2017-12-23 00:00:00,0
6,Second court rejects Trump bid to stop transge...,WASHINGTON (Reuters) - A federal appeals court...,politicsNews,2017-12-23 00:00:00,0
7,Failed vote to oust president shakes up Peru's...,LIMA (Reuters) - Peru’s President Pedro Pablo ...,politicsNews,2017-12-23 00:00:00,0
8,"Trump signs tax, government spending bills int...",WASHINGTON (Reuters) - U.S. President Donald T...,politicsNews,2017-12-22 00:00:00,0
9,Companies have up to a year for new U.S. tax b...,WASHINGTON (Reuters) - U.S. financial regulato...,politicsNews,2017-12-23 00:00:00,0


In [82]:
# remember to write about this in your data cleaning. leave this cell.... for proof. 
df_joined['date'].value_counts()

2017-12-20 00:00:00                                                                                                                                      182
2017-12-06 00:00:00                                                                                                                                      166
2017-11-30 00:00:00                                                                                                                                      162
2017-11-09 00:00:00                                                                                                                                      158
2017-10-13 00:00:00                                                                                                                                      155
                                                                                                                                                        ... 
2016-01-24 00:00:00                                       

In [85]:

# we have to find the 2 values with wrong dates - one is a URL and one is just not a datetime. 
# they are the reason that our target vector and X are different lengths. 

#sub = 'https://100percentfedup.com/12-yr-old-black-conservative-whose-video-to-obama-went-viral-do-you-really-love-america-receives-death-threats-from-left/'
#df_joined['date'].str.find(sub) 

df_joined = df_joined.drop(df_joined.loc[df_joined['date'] == 'https://100percentfedup.com/12-yr-old-black-conservative-whose-video-to-obama-went-viral-do-you-really-love-america-receives-death-threats-from-left/'])

df_joined = df_joined.drop(df_joined.loc[df_joined['date'] == 'Jul 19, 2015'])

KeyError: "['title' 'text' 'subject' 'date' 'Fake'] not found in axis"

In [26]:
# X and y split

target = df_joined['Fake']

X = df_joined.drop(df_joined['Fake'])

y = target

print('X shape: ', X.shape)
print('y shape: ', y.shape)

X shape:  (42798, 5)
y shape:  (42800,)


In [27]:
# need to go lookup the params for train_test_split

X_train, y_train, X_test, y_test = train_test_split(X, y)


ValueError: Found input variables with inconsistent numbers of samples: [42798, 42800]

In [15]:
# this is silly but it's good form, so here's a baseline. 
print('Baseline :', df_joined['Fake'].value_counts().max()/len(df_joined['Fake']))

Baseline : 0.5


# I don't think we'll use this but here's a Count Vectorizer just in case:

In [19]:
# instantiate the count vectorizer:
cv = CountVectorizer(stop_words = 'english', strip_accents ='ascii', max_features = 50, max_df = 0.95 , min_df = 0.01 )

# here's a function to return a dataframe:

def create_term_matrix(message_list, vectorizer):
    doc_term_df = vectorizer.fit_transform(message_list)
    return DataFrame(doc_term_df.toarray(),
                     columns=vectorizer.get_feature_names())

# now here's the actual "thing":

df_joined_CountVector = create_term_matrix(df_joined['text'], cv)

df_joined_CountVector.describe()

Unnamed: 0,according,american,called,campaign,clinton,country,court,democratic,did,donald,...,support,time,told,trump,united,washington,week,white,year,years
count,42800.0,42800.0,42800.0,42800.0,42800.0,42800.0,42800.0,42800.0,42800.0,42800.0,...,42800.0,42800.0,42800.0,42800.0,42800.0,42800.0,42800.0,42800.0,42800.0,42800.0
mean,0.31486,0.360047,0.295864,0.483551,0.617967,0.389182,0.329229,0.303621,0.369579,0.58972,...,0.297453,0.460584,0.529463,2.516986,0.536192,0.421916,0.299743,0.482874,0.557079,0.386706
std,0.787661,1.064934,0.659172,1.348366,2.304393,0.904831,1.352565,0.888218,0.797904,1.019764,...,0.767365,0.909996,0.939871,4.381324,1.283519,0.954844,0.675265,1.305999,1.120036,0.89086
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,1.0,1.0,3.0,1.0,1.0,0.0,0.0,1.0,1.0
max,20.0,48.0,10.0,31.0,67.0,26.0,43.0,18.0,20.0,32.0,...,24.0,28.0,13.0,73.0,42.0,35.0,10.0,36.0,27.0,61.0


In [None]:
# after reading today, it would appear that the results of the Tf-IDF Vectorizer are the same results are doing a Count Vectorizer followed by a Tf-IDF Transform. 
# Everyone I spoke to from lambda seemed to think the only model I should run here is probably the GradientBoostingClassifier. 
# I've been reading documentation on how to adjust TF-IDF Vectorizer.
# I still have questions regarding how to merge the results from this with my orginal df_joined(41,800 x 5) - so pin in that for now. 

# finally got git bash working with my github repo - big shoutout to Jacob Maxfield and Josh Carlisle. 
# getting a bit confused by pipenv install not working with the nltk package - don't need it for this though, maybe anaconda later... 
# create a dictionary for myself (ex: year and years same word) - (sisichen)
# find an NLP function that can cluster similiar words together - lookup common NLP functions - (sisichen)
# merging original frame and target vector with results so that I can train test split and fit model...
# Ngrams. need to grok that. sequences of words, but how many and in what order??

In [29]:
# alright, let's see if I can generate anything to show for myself re: TF-IDF Vectorizer... 

# instantiate:
tfidf = TfidfVectorizer(stop_words = 'english', strip_accents ='ascii', max_features = 100, min_df= 0.25 , max_df= 0.75)
# ngram_range=(1,2) - we're gonna play with the ngrams soon, be patient. 
def create_term_matrix(message_list, vectorizer):
    doc_term_df = vectorizer.fit_transform(message_list)
    return DataFrame(doc_term_df.toarray(),
                     columns=vectorizer.get_feature_names())

df_joined_tfidfvector = create_term_matrix(df_joined['text'], tfidf)

df_joined_tfidfvector.describe()

Unnamed: 0,did,donald,government,house,just,like,new,news,people,president,...,said,state,states,time,told,trump,united,washington,year,years
count,42800.0,42800.0,42800.0,42800.0,42800.0,42800.0,42800.0,42800.0,42800.0,42800.0,...,42800.0,42800.0,42800.0,42800.0,42800.0,42800.0,42800.0,42800.0,42800.0,42800.0
mean,0.057976,0.074466,0.09503,0.08585,0.081363,0.077764,0.093913,0.076367,0.120285,0.138979,...,0.284218,0.105809,0.081443,0.070904,0.080791,0.240466,0.075833,0.068417,0.084779,0.061547
std,0.124791,0.119814,0.19017,0.17762,0.157345,0.15873,0.168262,0.168511,0.19265,0.183979,...,0.254732,0.197131,0.1509,0.139149,0.142332,0.313201,0.152335,0.136781,0.161208,0.133028
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.065588,...,0.245756,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.052583,0.12262,0.113733,0.09145,0.114226,0.098738,0.140378,0.056638,0.186211,0.230136,...,0.470876,0.144329,0.122476,0.103186,0.130166,0.491378,0.087378,0.088731,0.121411,0.039868
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,0.980872,1.0,1.0,1.0


In [31]:
df_joined_tfidfvector.head()

Unnamed: 0,did,donald,government,house,just,like,new,news,people,president,...,said,state,states,time,told,trump,united,washington,year,years
0,0.0,0.190417,0.0,0.0,0.0,0.0,0.201728,0.0,0.0,0.163035,...,0.0,0.0,0.0,0.43239,0.0,0.524292,0.0,0.0,0.638001,0.0
1,0.151817,0.12332,0.143931,0.0,0.0,0.142834,0.130646,0.0,0.119388,0.211174,...,0.250203,0.271883,0.697557,0.0,0.0,0.113183,0.147631,0.291535,0.0,0.0
2,0.0,0.222648,0.0,0.0,0.0,0.0,0.235874,0.0,0.0,0.190631,...,0.150576,0.0,0.0,0.0,0.719675,0.40869,0.0,0.0,0.0,0.0
3,0.0,0.119469,0.139435,0.0,0.0,0.0,0.126565,0.0,0.115659,0.204578,...,0.323185,0.131696,0.405462,0.0,0.0,0.548239,0.429059,0.141215,0.133428,0.14765
4,0.0,0.064417,0.075183,0.0,0.071857,0.0,0.818922,0.0,0.0,0.055154,...,0.479215,0.0,0.072874,0.219412,0.0,0.118243,0.077116,0.0,0.071944,0.0


In [34]:
df_joined_tfidfvector.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42800 entries, 0 to 42799
Data columns (total 22 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   did         42800 non-null  float64
 1   donald      42800 non-null  float64
 2   government  42800 non-null  float64
 3   house       42800 non-null  float64
 4   just        42800 non-null  float64
 5   like        42800 non-null  float64
 6   new         42800 non-null  float64
 7   news        42800 non-null  float64
 8   people      42800 non-null  float64
 9   president   42800 non-null  float64
 10  republican  42800 non-null  float64
 11  reuters     42800 non-null  float64
 12  said        42800 non-null  float64
 13  state       42800 non-null  float64
 14  states      42800 non-null  float64
 15  time        42800 non-null  float64
 16  told        42800 non-null  float64
 17  trump       42800 non-null  float64
 18  united      42800 non-null  float64
 19  washington  42800 non-nul

In [None]:
# # Create pipeline
# model = Pipeline([
#     ('vectorizer', TfidfVectorizer(lowercase=True, ngram_range=(1,1))),
#     ('dim_red', TruncatedSVD(n_components=50, random_state=42)),
#     ('predictor', GradientBoostingClassifier(random_state=42))
# ])

# # Fit model to training data
# model.fit(X_train, y_train); 

# REMEMBER TO GET A VALIDATION SPLIT AND CHECK IT AGAINST THAT. 