In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

In [2]:
# Fetch NLTK's stop-word corpus; stopwords.words('english') in clean_text reads it.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\devan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Read the training and test splits with identical parser settings.
csv_options = {'engine': 'python', 'encoding': 'utf-8'}
train_data = pd.read_csv('train.csv', **csv_options)
test_data = pd.read_csv('test.csv', **csv_options)

In [4]:
# Preview the first rows to check which columns were parsed.
train_data.head()

Unnamed: 0,title,author,text,label,Unnamed: 4,Unnamed: 5
0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,,
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,,
2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,,
3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,,
4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1,,


In [5]:
# (rows, columns) of the training frame.
train_data.shape

(2501, 6)

In [6]:
# Number of missing values in each column.
train_data.isnull().sum()

title           65
author         253
text             7
label            0
Unnamed: 4    2491
Unnamed: 5    2500
dtype: int64

In [7]:
# Replace every missing value, in every column, with an empty string.
# Only train_data is filled here; clean_text returns '' for any non-string
# value, so missing text in test_data is still turned into an empty document.
train_data = train_data.fillna('')

In [8]:
_ENGLISH_STOP_WORDS = None  # populated on first use by _english_stop_words()


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, building it only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def clean_text(text, stop_words=None):
    """Normalise one document before it is vectorised.

    The text is lowercased, every character that is neither a word character
    nor whitespace is removed, the result is split on whitespace, stop words
    are dropped, and the remaining words are joined with single spaces.

    Parameters
    ----------
    text : object
        The raw document. Anything that is not a ``str`` (for example a NaN
        from a missing CSV field) is treated as an empty document.
    stop_words : iterable of str, optional
        Words to remove. When omitted, NLTK's English stop-word list is used;
        it is loaded once and reused across calls instead of being rebuilt
        for every row.

    Returns
    -------
    str
        The cleaned text, or ``''`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ''

    if stop_words is None:
        stop_words = _english_stop_words()
    elif not isinstance(stop_words, (set, frozenset)):
        # Guarantee O(1) membership tests for lists/tuples/generators.
        stop_words = frozenset(stop_words)

    # NOTE: punctuation is stripped before the stop-word filter, so a
    # contraction such as "don't" is compared against the list as "dont".
    stripped = re.sub(r'[^\w\s]', '', text.lower())
    return ' '.join(word for word in stripped.split() if word not in stop_words)


In [9]:
# Add the normalised article body to both splits under the same column name.
for frame in (train_data, test_data):
    frame['cleaned_text'] = frame['text'].apply(clean_text)

In [10]:
def _texts_and_binary_labels(frame):
    """Return ``(cleaned_text, label)`` for the rows of ``frame`` labelled 0 or 1.

    The training CSV parses with spill-over columns ('Unnamed: 4',
    'Unnamed: 5') that are non-null on a few rows, and the model's predictions
    come back as strings, so the ``label`` column is not purely numeric.
    Rows whose label cannot be read as 0 or 1 are dropped so they do not
    become extra classes. Labels are returned as the strings '0' / '1'.
    """
    numeric = pd.to_numeric(frame['label'], errors='coerce')
    keep = numeric.isin([0, 1])
    labels = numeric[keep].astype(int).astype(str)
    return frame.loc[keep, 'cleaned_text'], labels


X_train, y_train = _texts_and_binary_labels(train_data)
X_test, y_test = _texts_and_binary_labels(test_data)


In [11]:
# TF-IDF featuriser for the cleaned text: vocabulary capped at 5000 terms,
# with scikit-learn's built-in English stop-word list applied as well.
vectorizer = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
)

In [12]:
# Fit the vectorizer on the training text and transform it in one step.
X_train_vec = vectorizer.fit_transform(X_train)

In [13]:
svm = SVC()
# One sub-grid per kernel: gamma is a coefficient of the rbf kernel and is
# ignored by the linear kernel, so searching it there would only refit
# identical linear models.
param_grid = [
    {'kernel': ['linear'], 'C': [0.1, 1, 10]},
    {'kernel': ['rbf'], 'C': [0.1, 1, 10], 'gamma': ['scale', 'auto']},
]


In [14]:
# 5-fold cross-validated search over param_grid, using every available core.
# NOTE(review): X_train_vec was produced by fitting the vectorizer on the whole
# training set, so each validation fold has already influenced the features.
grid_search = GridSearchCV(
    estimator=svm,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train_vec, y_train)



In [15]:
# Report the parameter combination selected by the grid search.
print('Best hyperparameters:', grid_search.best_params_)

Best hyperparameters: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}


In [16]:
# Vectorise the test text with the already-fitted vectorizer (transform only,
# no refit), then predict with the best estimator found by the search.
X_test_vec = vectorizer.transform(X_test)
best_svm = grid_search.best_estimator_
y_pred = best_svm.predict(X_test_vec)


In [17]:
# Score the test-set predictions against the test labels.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.916


Validation

In [18]:
# Load the separate validation file (default parser settings) and preview it.
val = pd.read_csv('val.csv')
val.head()

Unnamed: 0,title,author,text,label
0,Four ways Bob Corker skewered Donald Trump,Anthony Zurcher,"\nOn Sunday morning, Donald Trump went off on ...",0
1,Linklater's war veteran comedy speaks to moder...,"Robin Pomeroy, Edward Baran","LONDON (Reuters) - “Last Flag Flying”, a comed...",0
2,JetNation FanDuel League; Week 4,,JetNation FanDuel League; Week 4\n% of readers...,1


In [19]:
# Run the same cleaning function over the validation text.
# Note the column is named 'clean_text' here, not 'cleaned_text' as in the
# train/test frames.
val['clean_text'] = val['text'].apply(clean_text)
val.head()

Unnamed: 0,title,author,text,label,clean_text
0,Four ways Bob Corker skewered Donald Trump,Anthony Zurcher,"\nOn Sunday morning, Donald Trump went off on ...",0,sunday morning donald trump went twitter tirad...
1,Linklater's war veteran comedy speaks to moder...,"Robin Pomeroy, Edward Baran","LONDON (Reuters) - “Last Flag Flying”, a comed...",0,london reuters last flag flying comedydrama vi...
2,JetNation FanDuel League; Week 4,,JetNation FanDuel League; Week 4\n% of readers...,1,jetnation fanduel league week 4 readers think ...


In [20]:
# Vectorise the cleaned validation text with the fitted vectorizer and
# predict with the tuned model; the bare `pred` displays the predictions.
val_vec = vectorizer.transform(val['clean_text'])
pred = best_svm.predict(val_vec)
pred

array(['0', '0', '1'], dtype=object)

In [35]:
# Leftover inspection cell: evaluating the bare name only displays the
# function's repr and has no effect on the analysis.
clean_text

<function __main__.clean_text(text)>

Deployment

In [58]:
import pickle

# Persist the fitted vectorizer, the tuned classifier and the cleaning function.
# NOTE: pickle stores a plain function by its qualified name, not its code, so
# clean_text.pkl can only be loaded where a clean_text function is available
# under the same module path (here: __main__).
artifacts = {
    'vectorizer.pkl': vectorizer,
    'svm_model.pkl': best_svm,
    'clean_text.pkl': clean_text,
}
for path, obj in artifacts.items():
    with open(path, 'wb') as f:
        pickle.dump(obj, f)
