In [1]:
import numpy as np
import pandas as pd

In [2]:
dataset=pd.read_csv('clean_text_news.csv')


In [3]:
dataset["year"]=dataset.date.str.split(',').str[1]

In [4]:
dataset.year.fillna('2018',inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset.year.fillna('2018',inplace=True)


In [5]:
dataset.year=dataset.year.str.strip()

In [6]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44689 entries, 0 to 44688
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   date        44689 non-null  object
 1   IsFake      44689 non-null  int64 
 2   title_text  44689 non-null  object
 3   year        44689 non-null  object
dtypes: int64(1), object(3)
memory usage: 1.4+ MB


In [7]:
dataset.year=dataset.year.astype(np.int64)

In [8]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44689 entries, 0 to 44688
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   date        44689 non-null  object
 1   IsFake      44689 non-null  int64 
 2   title_text  44689 non-null  object
 3   year        44689 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 1.4+ MB


In [9]:
dataset.drop('date',inplace=True,axis=1)

In [10]:
dataset.head()

Unnamed: 0,IsFake,title_text,year
0,0,u budget fight loom republican flip fiscal scr...,2017
1,0,u militari accept transgend recruit monday pen...,2017
2,0,senior u republican senat let mr mueller job w...,2017
3,0,fbi russia probe help australian diplomat tip ...,2017
4,0,trump want postal servic charg much amazon shi...,2017


In [11]:
corpus=dataset.title_text.values

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer=TfidfVectorizer(max_features=25000,min_df=5,max_df=0.7)
vectorized_arr=vectorizer.fit_transform(corpus).toarray()

In [13]:
vectorized_arr.shape

(44689, 25000)

In [14]:
from sklearn.decomposition import TruncatedSVD
svd=TruncatedSVD(n_components=300)
reduced_vec_arr=svd.fit_transform(vectorized_arr)

In [16]:
train_set=pd.DataFrame(reduced_vec_arr)

In [17]:
train_set['year']=dataset.year

In [20]:
train_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44689 entries, 0 to 44688
Columns: 301 entries, 0 to year
dtypes: float64(300), int64(1)
memory usage: 102.6 MB


In [22]:
train_set.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,291,292,293,294,295,296,297,298,299,year
0,0.363875,-0.012413,-0.279566,0.259308,-0.10694,-0.021839,-0.017721,0.142037,-0.00944,-0.024391,...,0.023386,-0.008769,-0.015133,0.002838,-0.006858,-0.004907,0.018868,-0.010167,-0.021797,2017
1,0.255102,-0.052546,-0.03663,-0.013458,-0.02649,-0.066599,-0.090976,-0.168911,0.012101,-0.139682,...,-0.016115,0.010415,0.025954,-0.033427,0.026025,0.01725,-0.020288,0.031084,-0.026674,2017
2,0.363986,0.109561,0.09822,0.132986,0.237107,-0.151706,-0.071024,-0.033271,0.068185,0.063403,...,0.029391,0.006182,-0.009334,-0.04481,0.005898,0.000789,0.022553,-6e-05,-0.008363,2017
3,0.296768,0.051377,0.100897,0.062956,0.225129,-0.120831,-0.047313,-0.002987,0.077732,-0.015782,...,-0.017609,-0.007843,-0.008278,0.002842,-0.005769,0.012562,-0.004454,0.016273,0.018697,2017
4,0.161978,-0.021273,-0.036325,0.011431,-0.025894,-0.016751,0.001928,0.062497,0.045934,-0.066738,...,0.001915,0.011996,-0.017509,-0.04092,-0.028302,-0.007682,0.019122,0.005741,0.010976,2017


In [23]:
train_set["IsFake"]=dataset.IsFake

In [24]:
train_set.to_csv("train.csv",header=True,index=False)