In [17]:
# this is our preamble cell :
# remember to check for anything missing 
import pandas as pd
from pandas import DataFrame
import numpy as np
import matplotlib 


import category_encoders as ce
from sklearn.model_selection import train_test_split


from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline


from sklearn import cluster
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.tree import plot_tree

In [57]:
# Load both CSVs and label every row: Fake = 0 for real articles, 1 for fake ones.

# True.csv has clean dates, so read_csv can parse them directly.
dfreal = pd.read_csv('True.csv', parse_dates=['date'])
dfreal['Fake'] = 0
print('Real data shape: ', dfreal.shape)

# Fake.csv carries non-date junk in its 'date' column, so parse_dates had no
# effect there (the column came back as plain strings anyway). Read it as text;
# it is filtered and converted to datetimes further down.
dffake = pd.read_csv('Fake.csv')
dffake['Fake'] = 1
print('Fake data shape: ', dffake.shape)

Real data shape:  (21417, 5)
Fake data shape:  (23481, 5)


In [58]:
# Sanity check on the real set: its dates display as parsed timestamps.
# Only the fake CSV has garbage in its date column, which is why parse_dates
# did not work there.
dfreal['date'].value_counts()
#.tail(50)

2017-12-20    182
2017-12-06    166
2017-11-30    162
2017-11-09    158
2017-10-13    155
             ... 
2016-09-11      1
2016-05-14      1
2016-05-30      1
2016-08-06      1
2016-09-03      1
Name: date, Length: 716, dtype: int64

In [59]:
# Same check on the fake set: the values are still raw strings (e.g. "May 10, 2017").
dffake['date'].value_counts()

May 10, 2017                                                                                                                                             46
May 5, 2016                                                                                                                                              44
May 26, 2016                                                                                                                                             44
May 6, 2016                                                                                                                                              44
May 11, 2016                                                                                                                                             43
                                                                                                                                                         ..
November 20, 2017                                               

In [84]:

# Position of the first comma in each fake-set date string. str.find returns -1
# when there is no comma, which would flag entries that do not look like
# "May 10, 2017".
comma_list = pd.DataFrame(dffake['date'].str.find(','))

# Print the distribution explicitly: a notebook cell only echoes its last
# expression, so a bare value_counts() here would be silently discarded.
print(comma_list.value_counts())
comma_list.shape

(23481, 1)

In [108]:
# Drop fake-set rows whose 'date' field contains one of these markers
# (URLs, entries with a '-', stray 'MSNBC' text).
searchfor = ['http', '-', 'MSNBC']

# Match each marker as a literal substring (regex=False) so that a future marker
# containing regex metacharacters (e.g. '.com') cannot over-match, and treat
# missing dates as "no match" (na=False) so `~` never sees NaN in the mask.
bad_date = pd.Series(False, index=dffake.index)
for marker in searchfor:
    bad_date |= dffake['date'].str.contains(marker, regex=False, na=False)

dffake2 = dffake[~bad_date]

In [109]:
# Re-check the fake-set dates after filtering; only date-like strings should remain.
dffake2['date'].value_counts()

May 10, 2017         46
May 26, 2016         44
May 6, 2016          44
May 5, 2016          44
May 11, 2016         43
                     ..
October 9, 2017       1
December 19, 2017     1
November 19, 2017     1
October 22, 2017      1
December 9, 2017      1
Name: date, Length: 1669, dtype: int64

In [110]:
# Row/column counts after cleaning: filtered fake set first, then the real set.
print(dffake2.shape, dfreal.shape, sep='\n')

(23436, 5)
(21417, 5)


In [105]:
#dffake2['date'] = pd.to_datetime(dffake2['date'], format='%m%d%y')
# dffake2['date'] = dffake2['date'].astype('datetime64[ns]')

In [116]:
# Balance the classes 50/50 by keeping the same number of rows from each set.
# Note: these slices keep the LAST 21,400 rows of each frame, not a random sample.

dfreal_trimmed = dfreal[-21_400 :]
print('Real trimmed shape: ', dfreal_trimmed.shape)

dffake_trimmed = dffake2[-21_400 :]
print('Fake trimmed shape: ', dffake_trimmed.shape)

# Stack the two halves into one frame with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0.
df_joined = pd.concat([dfreal_trimmed, dffake_trimmed], ignore_index=True)

# The real dates are already timestamps while the fake ones are still strings,
# so normalise the whole column to datetimes in one pass.
df_joined['date'] = pd.to_datetime(df_joined['date'])

print()
print('Combined and trimmed (equal parts Real and Fake) shape: ', df_joined.shape)
print()

Real trimmed shape:  (21400, 5)
Fake trimmed shape:  (21400, 5)

Combined and trimmed (equal parts Real and Fake) shape:  (42800, 5)



In [119]:
# Confirm the combined frame's dates converted: the listing should show only
# timestamps, with no raw strings or URLs.
df_joined['date'].value_counts()

# Holy smokes, I think all the datetimes are clean.
# Let's never spend 2 days on that again.

2017-12-20    194
2017-12-06    180
2017-11-09    178
2017-11-30    175
2017-10-13    171
             ... 
2015-06-21      1
2015-06-07      1
2015-07-19      1
2015-07-18      1
2015-04-02      1
Name: date, Length: 1004, dtype: int64

In [12]:

# All URLs and wrong dates seem to be corrected. Finally.
# X and y are still not the same length - need to fix that...



In [134]:
# Separate the label column from the feature columns.

# Label vector: the 0/1 'Fake' flag, available under both `target` and `y`.
y = target = df_joined['Fake']

# Feature frame: every column except the label.
X = df_joined.drop(columns=['Fake'])

print('X shape: ', X.shape)
print('y shape: ', y.shape)

X shape:  (42800, 4)
y shape:  (42800,)


In [15]:
# Majority-class baseline: the accuracy obtained by always predicting the most
# common label (rows in the biggest class divided by total rows).
print(
    'Baseline :',
    df_joined['Fake'].value_counts().max() / df_joined['Fake'].shape[0],
)

Baseline : 0.5


In [141]:
# Train / validation / test split.

# Leaving this code here in case I set up my target wrong.
# X = df.drop(['target'],axis=1).values   # independent features
# y = df['target'].values                 # dependent variable

# Step 1: hold out 25% of all rows as the test set (random_state=42 for repeatability).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
# doing 25/75 split and 42.

# Step 2: carve 25% of the remaining 75% off as the validation set.
# That is 0.25 x 0.75 = 0.1875 of all rows, leaving 0.5625 for training.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42) # val = 0.25 x 0.75 = 0.1875 of all rows

In [142]:
# Shapes of each split, features then labels: train, validation, test.
print(
    X_train.shape,
    y_train.shape,
    X_val.shape,
    y_val.shape,
    X_test.shape,
    y_test.shape,
    sep='\n',
)

(24075, 4)
(24075,)
(8025, 4)
(8025,)
(10700, 4)
(10700,)


In [None]:
# Wonderful. All variables should be set up the right way... but I will still check in with someone about whether I did this right. 

## *I don't think we'll use this but here's a Count Vectorizer just in case:*

In [122]:
# Bag-of-words vectorizer: English stop words removed, accents folded to ASCII,
# at most 50 vocabulary terms, document-frequency bounds of 0.01 and 0.95.
cv = CountVectorizer(
    stop_words='english',
    strip_accents='ascii',
    max_features=50,
    min_df=0.01,
    max_df=0.95,
)

# here's a function to return a dataframe:

def create_term_matrix(message_list, vectorizer):
    """Fit ``vectorizer`` on ``message_list`` and return the result as a DataFrame.

    Parameters
    ----------
    message_list : iterable of str
        The documents to vectorize (e.g. a text column).
    vectorizer : object
        Anything exposing ``fit_transform`` plus ``get_feature_names_out``
        (or the legacy ``get_feature_names``), such as a CountVectorizer or
        TfidfVectorizer. It is fitted in place by this call.

    Returns
    -------
    DataFrame
        One row per document and one column per term in the fitted vocabulary.
    """
    doc_term_matrix = vectorizer.fit_transform(message_list)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
    # prefer its replacement but fall back so older installs keep working.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    return DataFrame(doc_term_matrix.toarray(), columns=feature_names)

# Build the term-count table for every article and summarise it.
# NOTE(review): this fits the vectorizer on the full df_joined frame, i.e. before
# any train/validation/test separation. Fine for exploration, but fit on the
# training rows only when modelling, to avoid leakage.

df_joined_CountVector = create_term_matrix(df_joined['text'], cv)

df_joined_CountVector.describe()

Unnamed: 0,according,american,called,campaign,clinton,country,court,democratic,did,donald,...,support,time,told,trump,united,washington,week,white,year,years
count,42800.0,42800.0,42800.0,42800.0,42800.0,42800.0,42800.0,42800.0,42800.0,42800.0,...,42800.0,42800.0,42800.0,42800.0,42800.0,42800.0,42800.0,42800.0,42800.0,42800.0
mean,0.315047,0.360117,0.295794,0.483505,0.618014,0.389159,0.329299,0.303575,0.369673,0.590771,...,0.297734,0.460935,0.529416,2.523014,0.536028,0.422009,0.299813,0.483178,0.557126,0.386589
std,0.787854,1.065031,0.658973,1.347394,2.304309,0.904674,1.353023,0.888128,0.797977,1.02093,...,0.768017,0.910293,0.939773,4.386813,1.283105,0.954925,0.67532,1.305252,1.120097,0.890714
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,1.0,1.0,3.0,1.0,1.0,0.0,0.0,1.0,1.0
max,20.0,48.0,10.0,31.0,67.0,26.0,43.0,18.0,20.0,32.0,...,24.0,28.0,13.0,73.0,42.0,35.0,10.0,36.0,27.0,61.0


In [None]:
# A TF-IDF Vectorizer gives the same results as a Count Vectorizer followed by a TF-IDF Transformer.
# GradientBoostingClassifier. Good thing we learned some params today.
#
# I still have questions regarding how to merge the results from this with my original df_joined (42,800 x 5) - so put a pin in that for now.

# 
# create a dictionary for myself (ex: year and years same word) - (sisichen)
# find an NLP function that can cluster similiar words together - lookup common NLP functions - (sisichen)

# merging original frame and target vector with results so that I can train test split and fit model...
# Ngrams. They exist in the parameters for the vectorizer / model below... 

In [123]:
# Now the same exercise with a TF-IDF vectorizer.

# English stop words removed, accents folded to ASCII, at most 100 vocabulary
# terms, document-frequency bounds of 0.25 and 0.75.
# ngram_range=(1,2) is not set yet - n-grams are the next thing to try.
tfidf = TfidfVectorizer(
    stop_words='english',
    strip_accents='ascii',
    max_features=100,
    min_df=0.25,
    max_df=0.75,
)
def create_term_matrix(message_list, vectorizer):
    """Fit ``vectorizer`` on ``message_list`` and return the result as a DataFrame.

    Parameters
    ----------
    message_list : iterable of str
        The documents to vectorize (e.g. a text column).
    vectorizer : object
        Anything exposing ``fit_transform`` plus ``get_feature_names_out``
        (or the legacy ``get_feature_names``), such as a CountVectorizer or
        TfidfVectorizer. It is fitted in place by this call.

    Returns
    -------
    DataFrame
        One row per document and one column per term in the fitted vocabulary.
    """
    doc_term_matrix = vectorizer.fit_transform(message_list)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
    # prefer its replacement but fall back so older installs keep working.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    return DataFrame(doc_term_matrix.toarray(), columns=feature_names)

# Build the TF-IDF table for every article and summarise it.
# NOTE(review): as with the count vectorizer, this fits on the full df_joined
# frame; fit on the training rows only when modelling.
df_joined_tfidfvector = create_term_matrix(df_joined['text'], tfidf)

df_joined_tfidfvector.describe()

Unnamed: 0,did,donald,government,house,just,like,new,news,people,president,...,said,state,states,time,told,trump,united,washington,year,years
count,42800.0,42800.0,42800.0,42800.0,42800.0,42800.0,42800.0,42800.0,42800.0,42800.0,...,42800.0,42800.0,42800.0,42800.0,42800.0,42800.0,42800.0,42800.0,42800.0,42800.0
mean,0.057967,0.074543,0.095066,0.085921,0.081424,0.077734,0.093882,0.076336,0.120271,0.139008,...,0.284245,0.105763,0.081423,0.070897,0.080758,0.240842,0.075819,0.068428,0.084785,0.061514
std,0.124761,0.119868,0.190222,0.177617,0.157294,0.158628,0.168228,0.168455,0.192616,0.183971,...,0.254718,0.197059,0.150857,0.139093,0.142268,0.313414,0.152301,0.136795,0.161222,0.132984
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.065831,...,0.245803,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.052837,0.122746,0.113759,0.09176,0.11447,0.098803,0.140322,0.056671,0.186142,0.230169,...,0.470852,0.144302,0.122473,0.103245,0.130172,0.492389,0.087407,0.088721,0.121459,0.039246
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,0.980876,1.0,1.0,1.0


In [124]:
# Peek at the first few rows of TF-IDF weights.
df_joined_tfidfvector.head()

Unnamed: 0,did,donald,government,house,just,like,new,news,people,president,...,said,state,states,time,told,trump,united,washington,year,years
0,0.0,0.190346,0.0,0.0,0.0,0.0,0.201783,0.0,0.0,0.16303,...,0.0,0.0,0.0,0.432356,0.0,0.524103,0.0,0.0,0.638174,0.0
1,0.15181,0.123258,0.143941,0.0,0.0,0.142809,0.130664,0.0,0.119369,0.211139,...,0.250194,0.27194,0.697583,0.0,0.0,0.113127,0.14763,0.291545,0.0,0.0
2,0.0,0.222543,0.0,0.0,0.0,0.0,0.235915,0.0,0.0,0.190607,...,0.150575,0.0,0.0,0.0,0.719777,0.408504,0.0,0.0,0.0,0.0
3,0.0,0.119426,0.139466,0.0,0.0,0.0,0.126602,0.0,0.115657,0.204575,...,0.32322,0.131743,0.405537,0.0,0.0,0.54805,0.42912,0.14124,0.133467,0.147699
4,0.0,0.06438,0.075184,0.0,0.071809,0.0,0.818984,0.0,0.0,0.055141,...,0.479166,0.0,0.072873,0.219353,0.0,0.118178,0.07711,0.0,0.07195,0.0


In [125]:
# Column list and dtypes: shows how many terms survived the vectorizer's filters.
df_joined_tfidfvector.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42800 entries, 0 to 42799
Data columns (total 22 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   did         42800 non-null  float64
 1   donald      42800 non-null  float64
 2   government  42800 non-null  float64
 3   house       42800 non-null  float64
 4   just        42800 non-null  float64
 5   like        42800 non-null  float64
 6   new         42800 non-null  float64
 7   news        42800 non-null  float64
 8   people      42800 non-null  float64
 9   president   42800 non-null  float64
 10  republican  42800 non-null  float64
 11  reuters     42800 non-null  float64
 12  said        42800 non-null  float64
 13  state       42800 non-null  float64
 14  states      42800 non-null  float64
 15  time        42800 non-null  float64
 16  told        42800 non-null  float64
 17  trump       42800 non-null  float64
 18  united      42800 non-null  float64
 19  washington  42800 non-nul

In [None]:
# # Create pipeline
# model = Pipeline([
#     ('vectorizer', TfidfVectorizer(lowercase=True, ngram_range=(1,1))),
#     ('dim_red', TruncatedSVD(n_components=50, random_state=42)),
#     ('predictor', GradientBoostingClassifier(random_state=42))
# ])

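# # Fit model to training data
# # NOTE: TfidfVectorizer expects a 1-D sequence of strings, so pass the text
# # column (X_train['text']) rather than the whole 4-column X_train frame.
# model.fit(X_train['text'], y_train);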

# REMEMBER TO GET A VALIDATION SPLIT AND CHECK IT AGAINST THAT. 