# TF-IDF Feature Extraction
This notebook performs TF-IDF based feature extraction in preprocessed news text.
The notebook is intentionally self-ontained for clarity and reproducibilty.

In [9]:
import pandas as pd
import numpy as np
import re
import string

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [10]:
df_fake = pd.read_csv("../data/Fake.csv")
df_true = pd.read_csv("../data/True.csv")

In [11]:
df_fake.head

<bound method NDFrame.head of                                                    title  \
0       Donald Trump Sends Out Embarrassing New Year’...   
1       Drunk Bragging Trump Staffer Started Russian ...   
2       Sheriff David Clarke Becomes An Internet Joke...   
3       Trump Is So Obsessed He Even Has Obama’s Name...   
4       Pope Francis Just Called Out Donald Trump Dur...   
...                                                  ...   
23476  McPain: John McCain Furious That Iran Treated ...   
23477  JUSTICE? Yahoo Settles E-mail Privacy Class-ac...   
23478  Sunnistan: US and Allied ‘Safe Zone’ Plan to T...   
23479  How to Blow $700 Million: Al Jazeera America F...   
23480  10 U.S. Navy Sailors Held by Iranian Military ...   

                                                    text      subject  \
0      Donald Trump just couldn t wish all Americans ...         News   
1      House Intelligence Committee Chairman Devin Nu...         News   
2      On Friday, it was revea

In [12]:
df_fake["label"] = 0
df_true["label"] = 1

In [13]:
data = pd.concat([df_fake, df_true])
data = data.sample(frac=1)
data = data.reset_index(drop=True)

In [14]:
data = data[["label","text"]].copy()

In [15]:
def cleantext(text):
    text = text.lower()
    text = re.sub(r'\[.*?\]','', text)
    text = re.sub(r'https?://\S+|www\.\S+','', text)
    text = re.sub(r'<.*?>+','', text)
    text = re.sub(r'[%s]' % re.escape(string.punctuation),'', text)
    text = re.sub(r'\n','',text)
    text = re.sub(r'\w*\d\w*','',text)
    return  text

In [16]:
data["text"] = data["text"].apply(cleantext)

In [17]:
data["label"].value_counts()

label
0    23481
1    21417
Name: count, dtype: int64

In [18]:
X = data["text"]
y = data["label"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state= 42)

In [19]:
vectorizer = TfidfVectorizer(
    stop_words= "english",
    max_df= 0.7
)

In [20]:
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [21]:
X_train_tfidf.shape

(35918, 179173)