<a href="https://colab.research.google.com/github/SoorajSundar1505/MachineLearning/blob/master/Text_Data_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [34]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score

In [None]:
nltk.download('stopwords')

In [None]:
#Load the data to pandas
news_data = pd.read_csv('/content/drive/MyDrive/ML_DataSets/fake_news_dataset.csv')
news_data.head()

In [None]:
#Checking for missing values
news_data.isnull().sum()

In [6]:
#replacing missing value with null string
news_data= news_data.fillna('')

In [23]:
#merging authorname and news title
news_data['content'] = news_data['author']+' '+news_data['title']
print(news_data['content'])

0        Darrell Lucus House Dem Aide: We Didn’t Even S...
1        Daniel J. Flynn FLYNN: Hillary Clinton, Big Wo...
2        Consortiumnews.com Why the Truth Might Get You...
3        Jessica Purkiss 15 Civilians Killed In Single ...
4        Howard Portnoy Iranian woman jailed for fictio...
                               ...                        
20795    Jerome Hudson Rapper T.I.: Trump a ’Poster Chi...
20796    Benjamin Hoffman N.F.L. Playoffs: Schedule, Ma...
20797    Michael J. de la Merced and Rachel Abrams Macy...
20798    Alex Ansary NATO, Russia To Hold Parallel Exer...
20799              David Swanson What Keeps the F-35 Alive
Name: content, Length: 20800, dtype: object


In [None]:
#Seperating feature and target
# feature - content, target - label

x = news_data.drop(columns='label', axis=1)
y = news_data['label']
print(y)

In [28]:
#Stemming - The process of reducing words into root word
stem = PorterStemmer()

def stemming(content):
  stem_content = re.sub('[^a-zA-Z]',' ','content')
  stem_content=stem_content.lower()
  stem_content=stem_content.split()
  stem_content=[stem.stem(word) for word in stem_content if not word in stopwords.words('english')]
  stem_content=' '.join(stem_content)
  return stem_content



In [29]:
#Apply stemming function
news_data['content'] = news_data['content'].apply(stemming)


In [31]:
print(news_data['content'])

0        content
1        content
2        content
3        content
4        content
          ...   
20795    content
20796    content
20797    content
20798    content
20799    content
Name: content, Length: 20800, dtype: object


In [27]:
x = news_data['content'].values
y = news_data['label'].values

In [None]:
#Converting text data to feature vectors
vectorizer = TfidfVectorizer()
vectorizer.fit(x)
x = vectorizer.transform(x)
print(x)

In [33]:
#train and test
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=2)
print(x.shape, x_train.shape, x_test.shape)

(20800, 25173) (16640, 25173) (4160, 25173)


In [35]:
#training the model

classifier = svm.SVC(kernel='linear')
classifier.fit(x_train,y_train)

In [36]:
#Evaluate the model, find the accuracy score
x_train_prediction = classifier.predict(x_train)
x_train_prediction_accuracy = accuracy_score(x_train_prediction,y_train)
print(x_train_prediction_accuracy)

0.9979567307692307


In [37]:
#testing

prediction = classifier.predict(x_test)
x_test_prediction_accuracy = accuracy_score(prediction,y_test)
print(x_test_prediction_accuracy)

0.9925480769230769


In [None]:
input