# TweepFake - Twitter deep Fake text Dataset

In [90]:
import pandas as pd

# Read datasets

In [91]:
df1 = pd.read_csv("D://Dataset//Twiter//archive (4)\\test.csv")
df2 = pd.read_csv("D://Dataset//Twiter//archive (4)\\train.csv")
df3 = pd.read_csv("D://Dataset//Twiter//archive (4)\\validation.csv")

In [92]:
# concatenate the datasets into a single DataFrame
df = pd.concat([df1,df2,df3],ignore_index=True)

In [93]:
# Display basic information about the concatenate dataset
print("Concatenated Dataset info")
print(df.info())

Concatenated Dataset info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25572 entries, 0 to 25571
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   user_id       25572 non-null  object
 1   status_id     25572 non-null  object
 2   screen_name   25572 non-null  object
 3   account.type  25572 non-null  object
 4   class_type    25572 non-null  object
dtypes: object(5)
memory usage: 999.0+ KB
None


In [94]:
# Display summary statistics  of the concatenated dataset
print("\nSummary Statistics")
print(df.describe)


Summary Statistics
<bound method NDFrame.describe of                    user_id            status_id screen_name account.type  \
0               3171109449  1123915745178656769    human#17        human   
1                 18839785  1173906284195852290    human#11        human   
2                343587159  1197343799846027265     human#1        human   
3      1197916267975335939  1208274159274475521      bot#12          bot   
4                 15088390  1084181032927059970    human#10        human   
...                    ...                  ...         ...          ...   
25567   705113652471439361   714523361305608192      bot#16          bot   
25568            262794965   935057601103933441     human#8        human   
25569            343587159  1158520796039405569     human#1        human   
25570  1110407881030017024  1210364706457677824       bot#9          bot   
25571  1213988022728810496  1219363974657052675       bot#8          bot   

      class_type  
0          hum

In [95]:
# Check for missing values in concatenated dataset
print("\nMissing Vlaues")
print(df.isnull().sum())


Missing Vlaues
user_id         0
status_id       0
screen_name     0
account.type    0
class_type      0
dtype: int64


# Perform data preprocessing

In [96]:
# Example : Drop duplicates and handle missing values
df = df.drop_duplicates()
df = df.dropna()

In [97]:
# split the data into features X and target (y)
X = df ['screen_name']
y = df ['class_type']

# Split the data into training and testing sets

In [98]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y, test_size=0.2, random_state=42)

In [99]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Vectorize the text data using TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))

# Vectorize the text data using CountVectorizer

In [100]:
# Vectorizer the test data using CountVectorizer
vectroizer  = CountVectorizer()
X_train_vectorized = vectroizer.fit_transform(X_train)
X_test_vectorized = vectroizer.fit_transform(X_test)

# Evaluate Naive Bayes model

In [101]:
# Train a basic text classification model (Naive Bayes)
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()
classifier.fit(X_train_vectorized,y_train)

MultinomialNB()

In [102]:
# Predict the labels for the test set
y_pred = classifier.predict(X_test_vectorized)

In [103]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel accuracy :{accuracy:.2%}")


Model accuracy :90.09%


In [104]:
# Display classification report and confusion matrix
print("\n Classification Report")
print(classification_report(y_test,y_pred))


 Classification Report
              precision    recall  f1-score   support

        gpt2       1.00      0.37      0.54       785
       human       1.00      1.00      1.00      2549
      others       0.66      1.00      0.80       984
         rnn       1.00      0.98      0.99       796

    accuracy                           0.90      5114
   macro avg       0.91      0.84      0.83      5114
weighted avg       0.93      0.90      0.89      5114



In [105]:
print("\n Confusion Matrix :")
print(confusion_matrix(y_test,y_pred))


 Confusion Matrix :
[[ 294    0  491    0]
 [   0 2549    0    0]
 [   0    0  984    0]
 [   0    0   16  780]]


# Evaluate Logistic Regression model

In [106]:
from sklearn.linear_model import LogisticRegression
# Train a Logistic Regression classifier
lr_classifier = LogisticRegression(max_iter=1000)
lr_classifier.fit(X_train_vectorized, y_train)

LogisticRegression(max_iter=1000)

In [107]:
lr_pred = lr_classifier.predict(X_test_vectorized)
lr_accuracy = accuracy_score(y_test,lr_pred)

In [108]:
print("\n Logistic Regression Model:")
print(f"Accuracy:{lr_accuracy:.2%}")


 Logistic Regression Model:
Accuracy:90.85%


In [109]:
print("\nclassification_report")
print(classification_report(y_test,lr_pred))


classification_report
              precision    recall  f1-score   support

        gpt2       0.63      1.00      0.77       785
       human       1.00      1.00      1.00      2549
      others       1.00      0.54      0.70       984
         rnn       1.00      0.98      0.99       796

    accuracy                           0.91      5114
   macro avg       0.91      0.88      0.87      5114
weighted avg       0.94      0.91      0.91      5114



In [110]:
print("\nconfusion_matrix")
print(confusion_matrix(y_test,lr_pred))


confusion_matrix
[[ 785    0    0    0]
 [   0 2549    0    0]
 [ 452    0  532    0]
 [  16    0    0  780]]


# Evaluate Random Forest model

In [111]:
from sklearn.ensemble import RandomForestClassifier
# Train a Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train_vectorized,y_train)
rf_pred = rf_classifier.predict(X_test_vectorized)
rf_accuracy = accuracy_score(y_test,rf_pred)

In [112]:
print("\n Random Forest Model")
print(f"accuracy:{rf_accuracy:.2%}")


 Random Forest Model
accuracy:90.85%


In [113]:
print(classification_report(y_test,rf_pred))

              precision    recall  f1-score   support

        gpt2       0.63      1.00      0.77       785
       human       1.00      1.00      1.00      2549
      others       1.00      0.54      0.70       984
         rnn       1.00      0.98      0.99       796

    accuracy                           0.91      5114
   macro avg       0.91      0.88      0.87      5114
weighted avg       0.94      0.91      0.91      5114



In [114]:
print(confusion_matrix(y_test,rf_pred))

[[ 785    0    0    0]
 [   0 2549    0    0]
 [ 452    0  532    0]
 [  16    0    0  780]]


# Support Vector Machine 

In [115]:
from sklearn.svm import SVC
# Train a Support Vector Machine classifier
svm_classifier = SVC(kernel = 'linear', C=1.0)
svm_classifier.fit(X_train_vectorized, y_train)


SVC(kernel='linear')

# Evaluate the SVM model

In [117]:
svm_pred = svm_classifier.predict(X_test_vectorized)
svm_accuracy = accuracy_score(y_test, svm_pred)

In [120]:
# Display evaluation matrices for SVM
print("/n Support Vector Machine")
print(f"Accuracy: {svm_accuracy:.2%}")

/n Support Vector Machine
Accuracy: 90.85%


In [122]:
print("/nClassification Report")
print(classification_report(y_test,svm_pred))

/nClassification Report
              precision    recall  f1-score   support

        gpt2       0.63      1.00      0.77       785
       human       1.00      1.00      1.00      2549
      others       1.00      0.54      0.70       984
         rnn       1.00      0.98      0.99       796

    accuracy                           0.91      5114
   macro avg       0.91      0.88      0.87      5114
weighted avg       0.94      0.91      0.91      5114



In [123]:
print(confusion_matrix(y_test,svm_pred))

[[ 785    0    0    0]
 [   0 2549    0    0]
 [ 452    0  532    0]
 [  16    0    0  780]]


# Gradient Boosting classifier

In [125]:
from sklearn.ensemble import GradientBoostingClassifier
# Train a Gradient Boosting classifier
gb_classifier = GradientBoostingClassifier(n_estimators=100,random_state=42)
gb_classifier.fit(X_train_vectorized, y_train)

GradientBoostingClassifier(random_state=42)

In [126]:
# Evaluate the Gradient Boosting model
gb_pred = gb_classifier.predict(X_test_vectorized)
gb_accuracy = accuracy_score(y_test,gb_pred)

In [127]:
# Display evaluation metrics for Gradient Boosting
print("\nGradient Boosting Model")
print(f"Accuracy:{gb_accuracy:.2%}")


Gradient Boosting Model
Accuracy:90.85%


In [129]:
print(classification_report(y_test,gb_pred))

              precision    recall  f1-score   support

        gpt2       0.63      1.00      0.77       785
       human       1.00      1.00      1.00      2549
      others       1.00      0.54      0.70       984
         rnn       1.00      0.98      0.99       796

    accuracy                           0.91      5114
   macro avg       0.91      0.88      0.87      5114
weighted avg       0.94      0.91      0.91      5114



In [130]:
print(confusion_matrix(y_test,gb_pred))

[[ 785    0    0    0]
 [   0 2549    0    0]
 [ 452    0  532    0]
 [  16    0    0  780]]
