In [2]:
# svm has proved better performance. I check to see if I can improve its performance.

In [36]:
import pandas as pd
import numpy as np
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score,confusion_matrix,make_scorer,classification_report
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score


In [4]:
data=pd.read_csv('news_data.csv')

In [5]:
data.rename(columns={'Unnamed: 0':'news_id','text':'news_details'},inplace=True)

In [6]:
data['label']=data['label'].map({'FAKE':0,'REAL':1})

In [7]:
def convertlower(s):
    return s.lower()
def removeWhiteSpaces(s):
    return s.strip()
def removePunctuations(s):
    return s.translate(str.maketrans('','',string.punctuation))
def removeNumbers(s):
    return re.sub(r'\d+','',s)
def removeURL(s):
    return re.sub(r'^https?:\/\/.*[\r\n]*','',s)
def removeTags(s):
    remove_tags=re.compile('<.*?>')
    return re.sub(remove_tags,'',s)


In [8]:
col_to_clean=['title','news_details']
for col in col_to_clean:
    data[col]=data[col].apply(convertlower)
    data[col]=data[col].apply(removeWhiteSpaces)
    data[col]=data[col].apply(removePunctuations)
    data[col]=data[col].apply(removeNumbers)
    data[col]=data[col].apply(removeURL)
    data[col]=data[col].apply(removeTags)

In [9]:
tfidf=TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS,max_df=0.8,min_df=0.2)

In [10]:
tfidf_matrix_news_details=tfidf.fit_transform(data['news_details'])

In [11]:
tfidf_title=TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS,max_df=0.8,min_df=0.01)

In [24]:
X_train,X_test,y_train,y_test=train_test_split(tfidf_matrix_news_details,np.array(data['label']),test_size=0.2,random_state=42)

In [25]:
def svmclf(clf,X,y_true):
    y_pred=clf.predict(X) 
    score=f1_score(y_true,y_pred,average='weighted')
    cm=pd.DataFrame(confusion_matrix(y_true,y_pred),columns=['0','1'],index=['0','1'])
    return score,cm

In [14]:
clf=SVC()
clf.fit(X_train,y_train)

SVC()

In [15]:
score_test,cm_test=svmclf(clf,X_test,y_test)
score_train,cm_train=svmclf(clf,X_train,y_train)

In [16]:
score_test,score_train

(0.8500168547934291, 0.9190780184495986)

In [17]:
f1=make_scorer(f1_score,average='weighted')

In [18]:
grid_params={
    'kernel':['linear','poly','rbf'],
    'C':[0.1,1,10,100,1000],
    'gamma':[0.0001,0.001,0.01,0.1,1],
    }
gridclf=GridSearchCV(SVC(class_weight='balanced'),param_grid=grid_params,cv=10,return_train_score=False,scoring=f1)
gridclf.fit(tfidf_matrix_news_details,np.array(data['label']))

GridSearchCV(cv=10, estimator=SVC(class_weight='balanced'),
             param_grid={'C': [0.1, 1, 10, 100, 1000],
                         'gamma': [0.0001, 0.001, 0.01, 0.1, 1],
                         'kernel': ['linear', 'poly', 'rbf']},
             scoring=make_scorer(f1_score, average=weighted))

In [19]:
gridclf.best_score_

0.843311453484177

In [33]:
gridclf.best_params_

{'C': 1, 'gamma': 1, 'kernel': 'rbf'}

In [34]:
class_labels=['fake','real']

In [30]:
grid_predictions=gridclf.predict(X_test)

In [35]:
print(classification_report(y_test,grid_predictions,target_names=class_labels))

              precision    recall  f1-score   support

        fake       0.89      0.92      0.90       628
        real       0.91      0.89      0.90       639

    accuracy                           0.90      1267
   macro avg       0.90      0.90      0.90      1267
weighted avg       0.90      0.90      0.90      1267

