In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer configuration; it is fitted on the `clean_text` column
# in a later cell and then persisted with joblib.
tfidf = TfidfVectorizer(
    stop_words='english',  # drop scikit-learn's built-in English stop-word list
    ngram_range=(1, 2),    # extract unigrams and bigrams
    min_df=5,              # ignore terms that occur in fewer than 5 documents
    max_features=7000,     # cap the vocabulary at the 7000 most frequent terms
)

In [3]:
# Load the combined dataset from a path relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='data/cleaned_combined.csv')

In [4]:
# Keep only the text feature and the label column.
# Plain label selection is used instead of `reindex`: `reindex` would silently
# create an all-NaN column if 'clean_text' or 'target' were missing from the
# CSV, whereas this raises a KeyError right here, where the problem is.
df = df[['clean_text', 'target']]
df

Unnamed: 0,clean_text,target
0,national security adviser brzezinski die reute...,1
1,trump back conspiracy theory say hillary clint...,0
2,trump meet australian pm relation strain asylu...,1
3,senate clear way billion arm sale saudi arabia...,1
4,factbox trump twitter oct tax cut united kingd...,1
...,...,...
39095,watch malcolm nance wipe floor breitbart edito...,0
39096,trump target obamas climate initiative white h...,1
39097,germany refuse recognize catalonia independenc...,1
39098,new handgun design fold like smartphonebut goo...,0


In [5]:
# Show every row that has a missing value in at least one column
# (an empty result means there is nothing to drop or impute).
df.loc[df.isnull().any(axis='columns')]

Unnamed: 0,clean_text,target


In [6]:
# Learn the vocabulary and IDF weights from the text column and encode it
# as a sparse document-term matrix in a single pass.
# NOTE(review): the vectorizer is fitted on every row of `df`. If a train/test
# split is made downstream from this matrix, the vocabulary and IDF statistics
# will have seen the held-out documents too — confirm this is intended.
X = tfidf.fit_transform(raw_documents=df['clean_text'])

In [7]:
# Labels as a plain NumPy array, in the same row order as the TF-IDF matrix.
y = df['target'].to_numpy()

In [8]:
# Sanity check: the feature matrix and label vector must have the same number
# of rows; also peek at the first few learned vocabulary entries.
print(
    f"TF-IDF matrix shape: {X.shape}",
    f"Target shape: {y.shape}",
    f"Sample feature names: {tfidf.get_feature_names_out()[:10]}",
    sep="\n",
)

TF-IDF matrix shape: (39100, 7000)
Target shape: (39100,)
Sample feature names: ['abadi' 'abandon' 'abbas' 'abbott' 'abc' 'abc news' 'abdullah' 'abe'
 'abedin' 'abide']


In [9]:
# Display the sparse matrix summary (format, dtype, stored elements, shape).
X

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 5186979 stored elements and shape (39100, 7000)>

In [10]:
# Display the label array (NumPy abbreviates long arrays in its repr).
y

array([1, 0, 1, ..., 1, 0, 1], dtype=int64)

In [11]:
from scipy.sparse import save_npz
import numpy as np

# Persist the feature matrix in SciPy's sparse .npz format.
save_npz(file='data/X.npz', matrix=X)

# Persist the labels as text, one integer per line.
np.savetxt(fname='data/y.csv', X=y, fmt='%d', delimiter=',')

In [12]:
import joblib
import os

# Make sure the output directory exists; re-running the cell is harmless.
os.makedirs(name='models', exist_ok=True)

# Serialize the fitted vectorizer for later reuse.
# joblib.dump returns the list of file names it wrote, which the cell echoes.
joblib.dump(value=tfidf, filename='models/verifai_vectorizer.pkl')

['models/verifai_vectorizer.pkl']