# Fake News Detection

## Task 1

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Load the datasets
fake_train = pd.read_csv('Fake_train.csv')
fake_dev = pd.read_csv('Fake_dev.csv')

In [3]:
# Vectorize the text using TF-IDF
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(fake_train['text'])
y_train = fake_train['label']

In [4]:
X_dev = vectorizer.transform(fake_dev['text'])
y_dev = fake_dev['label']

### Naive Bayes

In [5]:
# Initialize a Naive Bayes classifier
classifier_nb = MultinomialNB()
classifier_nb.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [6]:
# Predict on the development set
predictions_nb = classifier_nb.predict(X_dev)

In [8]:
results_nb = pd.DataFrame({
    'Text': fake_dev['text'],
    'Actual': y_dev,
    'Predicted': predictions_nb
})

In [9]:
# Calculate the accuracy
accuracy_nb = accuracy_score(y_dev, predictions_nb)
print(f"Accuracy: {accuracy_nb}")

Accuracy: 0.7889570552147239


In [10]:
from colorama import Fore, Style

print("Actual  Predicted")
for actual, predicted in zip(results_nb['Actual'], results_nb['Predicted']):
    if actual == predicted:
        print(f"{actual:<8} {predicted:<10}")
    else:
        print(f"{Fore.RED}{actual:<8}{Style.RESET_ALL} {Fore.RED}{predicted:<10}{Style.RESET_ALL}")

Actual  Predicted
Fake     Fake      
Fake     Fake      
[31moriginal[0m [31mFake      [0m
original original  
[31moriginal[0m [31mFake      [0m
Fake     Fake      
[31moriginal[0m [31mFake      [0m
[31moriginal[0m [31mFake      [0m
[31moriginal[0m [31mFake      [0m
Fake     Fake      
Fake     Fake      
[31moriginal[0m [31mFake      [0m
[31mFake    [0m [31moriginal  [0m
Fake     Fake      
Fake     Fake      
original original  
original original  
Fake     Fake      
original original  
Fake     Fake      
original original  
[31mFake    [0m [31moriginal  [0m
Fake     Fake      
Fake     Fake      
original original  
[31moriginal[0m [31mFake      [0m
Fake     Fake      
Fake     Fake      
original original  
original original  
Fake     Fake      
original original  
original original  
Fake     Fake      
Fake     Fake      
Fake     Fake      
original original  
[31moriginal[0m [31mFake      [0m
original original  
Fake     Fake      
Fa

### SVM

In [11]:
classifier_svm = SVC(kernel='rbf')
classifier_svm.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [12]:
# Predict on the development set
predictions_svm = classifier_svm.predict(X_dev)

In [13]:
results_svm = pd.DataFrame({
    'Text': fake_dev['text'],
    'Actual': y_dev,
    'Predicted': predictions_svm
})

In [14]:
# Calculate the accuracy
accuracy_svm = accuracy_score(y_dev, predictions_svm)
print(f"Accuracy: {accuracy_svm}")

Accuracy: 0.501840490797546


In [15]:
print("Actual  Predicted")
for actual, predicted in zip(results_svm['Actual'], results_svm['Predicted']):
    if actual == predicted:
        print(f"{actual:<8} {predicted:<10}")
    else:
        print(f"{Fore.RED}{actual:<8}{Style.RESET_ALL} {Fore.RED}{predicted:<10}{Style.RESET_ALL}")

Actual  Predicted
[31mFake    [0m [31moriginal  [0m
[31mFake    [0m [31moriginal  [0m
original original  
original original  
original original  
[31mFake    [0m [31moriginal  [0m
original original  
original original  
original original  
[31mFake    [0m [31moriginal  [0m
[31mFake    [0m [31moriginal  [0m
original original  
[31mFake    [0m [31moriginal  [0m
[31mFake    [0m [31moriginal  [0m
[31mFake    [0m [31moriginal  [0m
original original  
original original  
[31mFake    [0m [31moriginal  [0m
original original  
[31mFake    [0m [31moriginal  [0m
original original  
[31mFake    [0m [31moriginal  [0m
[31mFake    [0m [31moriginal  [0m
[31mFake    [0m [31moriginal  [0m
original original  
original original  
[31mFake    [0m [31moriginal  [0m
[31mFake    [0m [31moriginal  [0m
original original  
original original  
[31mFake    [0m [31moriginal  [0m
original original  
original original  
[31mFake    [0m [31moriginal  

### Random Forest

In [16]:
param_grid = {'n_estimators': [50, 100, 200],
              'max_depth': [None, 10, 20, 30],
              'min_samples_split': [2, 5, 10],
              'min_samples_leaf': [1, 2, 4]}

In [17]:
classifier = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(classifier, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=42,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20, 30], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=0)

In [18]:
# Predict on the development set
best_classifier_rf = grid_search.best_estimator_
predictions_rf = best_classifier_rf.predict(X_dev)

In [19]:
results_rf = pd.DataFrame({
    'Text': fake_dev['text'],
    'Actual': y_dev,
    'Predicted': predictions_rf
})

In [20]:
# Calculate the accuracy
accuracy_rf = accuracy_score(y_dev, predictions_rf)
print(f"Accuracy: {accuracy_rf}")

Accuracy: 0.7558282208588957


In [21]:
print("Actual  Predicted")
for actual, predicted in zip(results_rf['Actual'], results_rf['Predicted']):
    if actual == predicted:
        print(f"{actual:<8} {predicted:<10}")
    else:
        print(f"{Fore.RED}{actual:<8}{Style.RESET_ALL} {Fore.RED}{predicted:<10}{Style.RESET_ALL}")

Actual  Predicted
[31mFake    [0m [31moriginal  [0m
Fake     Fake      
[31moriginal[0m [31mFake      [0m
original original  
[31moriginal[0m [31mFake      [0m
[31mFake    [0m [31moriginal  [0m
[31moriginal[0m [31mFake      [0m
original original  
[31moriginal[0m [31mFake      [0m
Fake     Fake      
Fake     Fake      
[31moriginal[0m [31mFake      [0m
[31mFake    [0m [31moriginal  [0m
Fake     Fake      
Fake     Fake      
original original  
original original  
Fake     Fake      
original original  
Fake     Fake      
original original  
[31mFake    [0m [31moriginal  [0m
[31mFake    [0m [31moriginal  [0m
Fake     Fake      
original original  
original original  
Fake     Fake      
Fake     Fake      
original original  
original original  
Fake     Fake      
original original  
original original  
Fake     Fake      
Fake     Fake      
[31mFake    [0m [31moriginal  [0m
original original  
original original  
original original  
[31

### Logistic Regression

In [24]:
param_grid_lr = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
              'solver': ['liblinear', 'lbfgs', 'newton-cg', 'sag']}

In [25]:
classifier_lr = LogisticRegression(max_iter=1000, random_state=42)
grid_search_lr = GridSearchCV(classifier_lr, param_grid_lr, cv=5, scoring='accuracy', n_jobs=-1)
grid_search_lr.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=1000, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100], 'solver': ['liblinear', 'lbfgs', 'newton-cg', 'sag']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=0)

In [26]:
# Predict on the development set
best_classifier_lr = grid_search_lr.best_estimator_
predictions_lr = best_classifier_lr.predict(X_dev)

In [27]:
results_lr = pd.DataFrame({
    'Text': fake_dev['text'],
    'Actual': y_dev,
    'Predicted': predictions_lr
})

In [28]:
# Calculate the accuracy
accuracy_lr = accuracy_score(y_dev, predictions_lr)
print(f"Accuracy: {accuracy_lr}")

Accuracy: 0.7865030674846626


In [29]:
print("Actual  Predicted")
for actual, predicted in zip(results_lr['Actual'], results_lr['Predicted']):
    if actual == predicted:
        print(f"{actual:<8} {predicted:<10}")
    else:
        print(f"{Fore.RED}{actual:<8}{Style.RESET_ALL} {Fore.RED}{predicted:<10}{Style.RESET_ALL}")

Actual  Predicted
Fake     Fake      
Fake     Fake      
[31moriginal[0m [31mFake      [0m
[31moriginal[0m [31mFake      [0m
[31moriginal[0m [31mFake      [0m
[31mFake    [0m [31moriginal  [0m
[31moriginal[0m [31mFake      [0m
original original  
original original  
Fake     Fake      
Fake     Fake      
[31moriginal[0m [31mFake      [0m
[31mFake    [0m [31moriginal  [0m
Fake     Fake      
Fake     Fake      
original original  
original original  
Fake     Fake      
original original  
Fake     Fake      
original original  
[31mFake    [0m [31moriginal  [0m
Fake     Fake      
Fake     Fake      
original original  
[31moriginal[0m [31mFake      [0m
Fake     Fake      
Fake     Fake      
original original  
original original  
Fake     Fake      
original original  
original original  
Fake     Fake      
Fake     Fake      
Fake     Fake      
original original  
original original  
original original  
Fake     Fake      
Fake     Fake      


### Inference 

In [37]:
print("NB Accuracy: ",accuracy_nb)
print("SVM Accuracy: ",accuracy_svm)
print("RF Accuracy: ",accuracy_rf)
print("LR Accuracy: ",accuracy_lr)

NB Accuracy:  0.788957055215
SVM Accuracy:  0.501840490798
RF Accuracy:  0.755828220859
LR Accuracy:  0.786503067485


### Applying NB in test data which is without label.

In [38]:
test_without_label = pd.read_csv('Fake_test_without_labels.csv')

In [39]:
x = vectorizer.transform(test_without_label['text'])

In [40]:
predictions_without_label_nb = classifier_nb.predict(x)

In [41]:
results_without_label_nb = pd.DataFrame({
    'Text': test_without_label['text'],
    'Predicted': predictions_without_label_nb
})

In [42]:
results_without_label_nb

Unnamed: 0,Predicted,Text
0,Fake,5000 ഉള്ള പോൾ ലോഗ്‌ഡ്‌വൻ ഇപ്പോള് 250000 എന്താ...
1,original,ഓഷോ രജനീഷ് പറഞ്ഞപോലെ എനിക്കപ്പോൾ തോന്നിയത് അ...
2,Fake,ചേട്ടാ വാർത്ത വയ്ക്കുന്നത് കേരളത്തിലാണ് സം...
3,Fake,Shame for entire Woman&#39
4,Fake,135 code janaghal andhu wide business cheythal...
5,Fake,Why not all countries club together n block ch...
6,original,Ethil appuram നാണക്കേഡ് വന്നിട്ടില്ല cpmne🤣🤣🤣a...
7,original,കൊറോണ പോയി ഒന്ന് കൂടെ മെച്ചപ്പെട്ട് ഓമൈക്രോനായ...
8,original,മോളെ. ഇത് കോമഡി സ്റ്റാർസ് അല്ല. ചിരിച്ചും കളിച...
9,original,Marunaadan kandupidutham


In [43]:
results_without_label_nb.to_csv('output_file_task1.csv', index=False)

In [44]:
df=results_without_label_nb

In [46]:
actual_labels = pd.read_csv('Dataset/Fake_test_with_labels1.csv')

In [49]:
merged_df = pd.merge(df, actual_labels, left_on='Text', right_on='text')

In [50]:
correct_predictions = merged_df['Predicted'] == merged_df['label']
accuracy = correct_predictions.sum() / len(correct_predictions)

print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 78.50%
