In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sbs
import re, string
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, log_loss, classification_report, accuracy_score,  roc_auc_score, roc_curve, auc
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
import pickle
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import LabelEncoder

In [26]:
# Load the labelled news dataset; the path is relative to the notebook's
# working directory.
data = pd.read_csv('fake_and_real_news.csv')
# Preview the first rows to confirm the file parsed as expected.
data.head()

Unnamed: 0,Text,label
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake
1,U.S. conservative leader optimistic of common ...,Real
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real
3,Court Forces Ohio To Allow Millions Of Illega...,Fake
4,Democrats say Trump agrees to work on immigrat...,Real


In [27]:
# Summarise the frame: column names, non-null counts, dtypes and memory use.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9900 entries, 0 to 9899
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Text    9900 non-null   object
 1   label   9900 non-null   object
dtypes: object(2)
memory usage: 154.8+ KB


In [28]:
# Count missing values per column.
data.isnull().sum()

Text     0
label    0
dtype: int64

In [29]:
# Class distribution of the target column.
data['label'].value_counts()

Fake    5000
Real    4900
Name: label, dtype: int64

In [30]:
# Fit a LabelEncoder on the label column and encode it as integer class ids.
# NOTE(review): `y` is rebound to data['label'] in a later cell before the
# train/test split, so this encoded array is not what the model is trained
# on - confirm whether the encoding is still wanted.
encoder = LabelEncoder()
y = encoder.fit_transform(data['label'])



In [31]:
import nltk

# Fetch the NLTK resources used by the text preprocessing before anything
# tries to read them, so the notebook also runs on a fresh environment.
nltk.download('stopwords')
# WordNetLemmatizer.lemmatize reads the WordNet corpus; without this download
# a machine that does not already have it raises LookupError.
nltk.download('wordnet')

# Kept for backward compatibility; the cells visible here use `stemmer` instead.
port_stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))
punc_list = set(string.punctuation)
stemmer = SnowballStemmer('english')
lemmatizer = WordNetLemmatizer()

# Tokenizer models for word_tokenize. Left as the last expression so the cell
# still displays the download's True result.
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [32]:
def preprocessing(text):
    """Normalise raw article text into a space-separated string of tokens.

    The text is lower-cased and tokenised. English stop words, punctuation
    and any token that is not purely alphabetic are discarded. Every token
    that survives is stemmed with ``stemmer`` and the stem is then passed
    through ``lemmatizer``.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The processed tokens joined by single spaces.
    """
    cleaned_tokens = []
    for token in word_tokenize(text.lower()):
        if token in stop_words:
            continue
        if token in punc_list or not token.isalpha():
            continue
        # Same order as before: stem first, then lemmatize the stem.
        cleaned_tokens.append(lemmatizer.lemmatize(stemmer.stem(token)))

    return ' '.join(cleaned_tokens)

In [33]:
# Replace every article with its preprocessed form.
# NOTE(review): this overwrites the column it reads from, so re-running the
# cell applies `preprocessing` a second time to already-processed text;
# reload the CSV before re-running.
data['Text'] = data['Text'].apply(preprocessing)

In [34]:
# Model inputs are the article texts; targets are the 'Fake' / 'Real' labels.
x, y = data['Text'], data['label']

In [35]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [36]:
# Sanity check: number of held-out documents.
X_test.shape

(1980,)

In [37]:
# Sanity check: number of held-out labels (should match X_test).
y_test.shape

(1980,)

In [38]:
# Learn the TF-IDF vocabulary and IDF weights from the training texts only,
# then reuse the fitted vectorizer on the test texts so that no information
# from the held-out split leaks into the features.
# NOTE: X_train / X_test are rebound from raw text to TF-IDF matrices here,
# so re-run the train/test split cell before re-running this one.
vc = TfidfVectorizer()
X_train = vc.fit_transform(X_train)
X_test = vc.transform(X_test)

In [39]:

# Train a support vector classifier with scikit-learn's default
# hyperparameters on the training features.
model=SVC()
model.fit(X_train,y_train)

In [40]:
# Score the classifier on the held-out split.
y_pred = model.predict(X_test)
# scikit-learn metrics take (y_true, y_pred); use the same argument order as
# the classification_report call below.
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc*100}")
# Per-class precision, recall and F1.
print(classification_report(y_test, y_pred))

Accuracy: 99.4949494949495
              precision    recall  f1-score   support

        Fake       1.00      0.99      0.99       973
        Real       0.99      1.00      1.00      1007

    accuracy                           0.99      1980
   macro avg       0.99      0.99      0.99      1980
weighted avg       0.99      0.99      0.99      1980



In [41]:
# Bundle the fitted vectorizer with the trained classifier so both can be
# saved and restored together.
model_data = dict(vectorizer=vc, model=model)

In [42]:
# Serialise the vectorizer/model bundle to nlp_pipeline.pkl in the working
# directory. Pickle needs the file opened in binary mode.
with open('nlp_pipeline.pkl', 'wb') as file:
    pickle.dump(model_data, file)

In [45]:
def pred(text):
    """Classify one raw article string and print the predicted label.

    The input goes through ``preprocessing`` and the fitted vectorizer ``vc``
    before being passed to ``model``. Nothing is returned; the predicted
    label is only printed.

    Parameters
    ----------
    text : str
        Raw article text to classify.
    """
    cleaned = preprocessing(text)
    # The vectorizer expects an iterable of documents, hence the one-item list.
    features = vc.transform([cleaned])
    val = model.predict(features)
    print(f"Category : {val[0]}")

In [47]:
# Smoke-test the classifier on one sample article. The sample text spans two
# physical lines, so it is written as a triple-quoted literal: a single-quoted
# string literal cannot contain a line break.
pred('''Senate race in Alabama exposes Republican rift (Reuters) - U.S. President Donald Trump is due to travel to Alabama on Friday to whip up votes in a Senate race that has laid bare the rift between the Republican party’s leadership and its anti-establishment wing. In a move that surprised political analysts, Trump will appear alongside the party-approved incumbent rather than the controversial insurgent. The president’s decision to campaign for Senator Luther Strange ahead of Tuesday’s primary runoff election came despite Trump’s strained relationship with Republican Senate Majority Leader Mitch McConnell, one of Strange’s strongest supporters.  In last month’s initial round of voting, Strange finished second to former Alabama Chief Justice Roy Moore, whose “drain the swamp” credentials have been burnished by high-profile endorsements, including that of former White House strategist Steve Bannon. “I think that if you were to approach this without any prior knowledge ... Judge Moore would be the more logical ‘Trumpian’ candidate,” said Steven Taylor, a political science professor at Troy University in Alabama. The winner on Tuesday will be running in December against Democratic candidate Doug Jones in the Republican-dominated state. Neither Republican campaign responded to requests for comment. Moore, 70, is a religious conservative who twice lost his position as the state’s top judge. He was ousted in 2003 after he refused a federal court order to remove a Ten Commandments monument from the state Supreme Court building. After winning back the seat in 2013, he directed state judges to ignore the U.S. Supreme Court’s ruling legalizing same-sex marriage, was suspended and then resigned. Moore has vowed to support Trump’s agenda and has blamed McConnell for the Senate’s lack of major legislative victories. 
A Moore campaign video released last week showed a photo of McConnell beside the words “Washington insiders” as Moore’s wife warned he would not “bend his knee.” Strange, 64, was appointed to fill the Senate seat left vacant when Jeff Sessions became U.S. attorney general. Known as “Big Luther” due to his 6-foot-9-inch stature, he has been backed by nearly $9 million of advertising from a McConnell-allied political action committee. “I am supporting ‘Big’ Luther Strange because he was so loyal & helpful to me!” Trump tweeted on Wednesday. Strange served as state attorney general under former Governor Robert Bentley, who resigned in April. Moore has endured his share of controversy, including negative press this week after he mentioned “reds” and “yellows” in an apparent reference to American Indians and Asians.  In a recent advertisement, Strange questioned Moore’s commitment to Trump’s planned wall at the Mexican border. “Roy Moore: wrong on President Trump, wrong on the wall, wrong for Alabama,” the ad’s narrator says.     The two candidates are scheduled to square off at their only debate on Thursday. With turnout expected to be low for Tuesday’s runoff, Troy University’s Taylor said both campaigns have reason to believe they hold an edge. “Moore’s advantage is that he does appeal to the true-believing base,” Taylor said. “In terms of which of the campaigns is going to be more adept at the ground game, I would think Strange has an advantage.''')

Category : Real
