# 4.4.2 Improving your model

In [27]:
# Import the necessary modules
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
# # Import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.feature_extraction.text import CountVectorizer

from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB 

df=pd.DataFrame.from_csv('fake_or_real_news.csv')

In [28]:
# Print the head of df
print(df.head())

# Create a series to store the labels: y
y = df.label

# Create training and test sets
X_train, X_test, y_train, y_test = train_test_split(
                                             df['text'], y, 
                                             test_size=0.33, 
                                             random_state=53)

                                                   title  \
8476                        You Can Smell Hillary’s Fear   
10294  Watch The Exact Moment Paul Ryan Committed Pol...   
3608         Kerry to go to Paris in gesture of sympathy   
10142  Bernie supporters on Twitter erupt in anger ag...   
875     The Battle of New York: Why This Primary Matters   

                                                    text label  
8476   Daniel Greenfield, a Shillman Journalism Fello...  FAKE  
10294  Google Pinterest Digg Linkedin Reddit Stumbleu...  FAKE  
3608   U.S. Secretary of State John F. Kerry said Mon...  REAL  
10142  — Kaydee King (@KaydeeKing) November 9, 2016 T...  FAKE  
875    It's primary day in New York and front-runners...  REAL  


Count Vectors

In [29]:
# Initialize a TfidfVectorizer object: tfidf_vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words="english", max_df=0.7)

# Transform the training data: tfidf_train 
tfidf_train = tfidf_vectorizer.fit_transform(X_train.values)

# Transform the test data: tfidf_test 
# DO NOT FIT TEST DATA because test data may include NEW WORDS
tfidf_test = tfidf_vectorizer.transform(X_test.values)

# Print the first 10 features
print(tfidf_vectorizer.get_feature_names()[:10])

# Print the first 5 vectors of the tfidf training data
print(tfidf_train[:5])

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


['00', '000', '0000', '00000031', '000035', '00006', '0001', '0001pt', '000ft', '000km']
  (1, 42470)	0.07711040274149526
  (1, 12105)	0.15008066461476866
  (1, 54177)	0.13782629144711137
  (1, 50628)	0.061296988343109586
  (1, 15924)	0.3479045460649079
  (1, 44520)	0.4973826512693341
  (1, 51896)	0.11596517664605868
  (1, 35783)	0.30902690818827977
  (1, 35256)	0.12628385718450857
  (1, 21881)	0.21271688045815978
  (1, 42534)	0.06081715886809217
  (1, 8399)	0.08729542880625335
  (1, 29531)	0.1454406205718245
  (1, 15927)	0.4973826512693341
  (1, 25686)	0.13550453594288983
  (1, 49203)	0.1672740861784377
  (1, 16814)	0.10404977746548139
  (1, 36087)	0.12648679854389897
  (1, 21568)	0.1007920919566398
  (1, 25684)	0.1030420922189754
  (1, 38823)	0.06048803110658644
  (1, 47506)	0.14539060877460044
  (1, 36831)	0.10772488937433067
  (2, 16972)	0.1606296088662543
  (2, 762)	0.48803966069171073
  :	:
  (4, 19325)	0.05452053080897492
  (4, 7259)	0.06755319386644243
  (4, 51456)	0.0647535378

In [30]:
# Create the list of alphas: alphas
alphas = np.arange(0,1,0.1)

# Define train_and_predict()
def train_and_predict(alpha):
    # Instantiate the classifier: nb_classifier
    nb_classifier = MultinomialNB(alpha=alpha)
    # Fit to the training data
    nb_classifier.fit(tfidf_train, y_train)
    # Predict the labels: pred
    pred = nb_classifier.predict(tfidf_test)
    # Compute accuracy: score
    score = metrics.accuracy_score(y_test, pred)
    return score

# Iterate over the alphas and print the corresponding score
accuracy_dict={}
for alpha in alphas:
    print('Alpha: ', alpha)
    accuracy=train_and_predict(alpha)
    print('Score: ', accuracy)
    print()
    accuracy_dict[str(alpha)]=accuracy

Alpha:  0.0
Score:  0.6150167384026781

Alpha:  0.1
Score:  0.8976566236250598

Alpha:  0.2
Score:  0.8938307030129125

Alpha:  0.30000000000000004
Score:  0.8900047824007652

Alpha:  0.4
Score:  0.8857006217120995

Alpha:  0.5
Score:  0.8842659014825442

Alpha:  0.6000000000000001
Score:  0.874701099952176

Alpha:  0.7000000000000001


  self.feature_log_prob_ = (np.log(smoothed_fc) -


Score:  0.8703969392635102

Alpha:  0.8
Score:  0.8660927785748446

Alpha:  0.9
Score:  0.8589191774270684



In [31]:
accuracy_dict

{'0.0': 0.6150167384026781,
 '0.1': 0.8976566236250598,
 '0.2': 0.8938307030129125,
 '0.30000000000000004': 0.8900047824007652,
 '0.4': 0.8857006217120995,
 '0.5': 0.8842659014825442,
 '0.6000000000000001': 0.874701099952176,
 '0.7000000000000001': 0.8703969392635102,
 '0.8': 0.8660927785748446,
 '0.9': 0.8589191774270684}