<a href="https://colab.research.google.com/github/Sharanya-Parimanoharan/AI-Generated-Text-Detection/blob/main/LogisticRegression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount the Colab drive at /content/drive; later cells read and write
# files under drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import nltk
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import  balanced_accuracy_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.metrics import roc_curve

import matplotlib.pyplot as plt
import joblib
from nltk import pos_tag
import string
from sklearn.preprocessing import StandardScaler
import torch.nn as nn
import torch.optim as optim
import torch

In [None]:
# Load the labelled corpus; later cells read its 'text' and 'label' columns.
data_path = "drive/MyDrive/data_2.csv"
df = pd.read_csv(data_path)


In [None]:
# Data preprocessing

# Remove punctuation
# Translation table that deletes every character in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(text):
    """Return ``text`` with every ASCII punctuation character removed.

    Only the characters listed in ``string.punctuation`` are stripped;
    whitespace, letters, digits and non-ASCII symbols are kept as-is.

    Non-string inputs (for example the float ``NaN`` that pandas uses for a
    missing cell, or ``None``) are returned unchanged instead of raising
    ``TypeError``, so the function can safely be applied to a column that
    still contains missing values.
    """
    if not isinstance(text, str):
        return text
    return text.translate(_PUNCTUATION_TABLE)

# Handle missing values FIRST: remove_punctuation operates on strings, so
# rows with missing cells are dropped before any text processing happens.
missing_values = df.isnull().sum()  # per-column null counts before cleaning
df = df.dropna().copy()  # explicit copy so the column assignments below are unambiguous

# Strip ASCII punctuation from every remaining document.
df['text'] = df['text'].apply(remove_punctuation)
missing_values = df.isnull().sum()  # per-column null counts after cleaning

print(df.tail())

print(df.shape)

print(df.isnull().sum())

print(df['label'].value_counts())

# Binary target: 1 where the label equals the literal string '"AI"'
# (double quotes included), 0 for everything else.
# NOTE(review): confirm the CSV really stores the label with embedded quotes;
# if it does not, every row is mapped to 0.
df["label_int"] = np.where(df["label"] == '"AI"', 1, 0)

# Call head() -- printing the bare attribute only shows the bound method.
print(df.head())

print(df['label_int'].value_counts())


In [None]:
# Summary statistics of the integer-encoded label column.
label_summary = df['label_int'].describe()
print(label_summary)


In [None]:

# Hold out a test split (scikit-learn's default proportions, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    df["text"],
    df["label_int"],
    random_state=0,
)

# Both vectorizers share the same vocabulary settings: unigrams and bigrams
# that occur in at least five training documents.
ngram_settings = {"min_df": 5, "ngram_range": (1, 2)}

# Bag-of-words counts; the vocabulary is learned from the training split only.
vect = CountVectorizer(**ngram_settings)
X_train_vectorized = vect.fit_transform(X_train)
X_test_vectorized = vect.transform(X_test)

# TF-IDF weights; again fitted on the training split only.
tfidf_vect = TfidfVectorizer(**ngram_settings)
X_train_tfidf = tfidf_vect.fit_transform(X_train)
X_test_tfidf = tfidf_vect.transform(X_test)

In [None]:
# Fetch the NLTK resources used by the tokenizer and the POS tagger below.
for nltk_resource in ('punkt', 'averaged_perceptron_tagger'):
    nltk.download(nltk_resource)
# Feature extraction for POS tags

def preprocess_text(text):
    """Turn a document into a space-separated string of its POS tags.

    The text is tokenized with ``nltk.word_tokenize``, tagged with
    ``nltk.pos_tag``, and only the tags (not the words) are kept, in order.
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text))
    tags_only = (tag for _word, tag in tagged_tokens)
    return ' '.join(tags_only)

# Convert every document into its POS-tag sequence (one string per document).
X_train_pos = [preprocess_text(text) for text in X_train]
X_test_pos = [preprocess_text(text) for text in X_test]

# Count POS tags per document.
# The tag strings are whitespace-separated, so tokens are split on whitespace
# only. CountVectorizer's default token pattern keeps just runs of two or more
# word characters, which strips the '$' from tags such as PRP$ and WP$ and
# silently merges them with PRP and WP.
pos_vectorizer = CountVectorizer(token_pattern=r"(?u)\S+")

# Learn the tag vocabulary from the training split and encode it.
X_train_pos_vec = pos_vectorizer.fit_transform(X_train_pos)

# Encode the test split with the training vocabulary.
X_test_pos_vec = pos_vectorizer.transform(X_test_pos)


In [None]:
import matplotlib.pyplot as plt
import numpy as np


def _plot_pos_tag_distribution(pos_matrix, labels, tag_names, title_split, legend_split):
    """Draw side-by-side bars of total POS-tag counts for the AI and Human classes.

    Parameters
    ----------
    pos_matrix : sparse matrix of POS-tag counts, one row per document.
    labels : iterable of 0/1 class labels aligned row-by-row with ``pos_matrix``.
    tag_names : sequence of tag names, one per column of ``pos_matrix``.
    title_split : split name used in the figure title (e.g. 'Training').
    legend_split : split name used in the legend entries (e.g. 'Train').
    """
    label_array = np.asarray(labels)
    # Positional row indices per class (1 = AI, 0 = Human).
    ai_rows = np.flatnonzero(label_array == 1)
    human_rows = np.flatnonzero(label_array == 0)

    # Column sums give the total number of occurrences of each tag per class.
    counts_ai = np.asarray(pos_matrix[ai_rows].sum(axis=0)).ravel()
    counts_human = np.asarray(pos_matrix[human_rows].sum(axis=0)).ravel()

    bar_width = 0.35
    positions = np.arange(len(tag_names))

    fig, ax = plt.subplots(figsize=(15, 6))
    ax.bar(positions, counts_ai, color='blue', width=bar_width,
           edgecolor='grey', label=f'AI - {legend_split}')
    ax.bar(positions + bar_width, counts_human, color='orange', width=bar_width,
           edgecolor='grey', label=f'Human - {legend_split}')

    ax.set_title(f'POS Tag Distribution - {title_split} Set')
    ax.set_xlabel('POS Tags')
    ax.set_ylabel('Frequency')
    # Centre each tick between the two bars of its group.
    ax.set_xticks(positions + bar_width / 2)
    ax.set_xticklabels(tag_names, rotation=45, ha='right')
    ax.legend()
    plt.show()


# Tag names come from the fitted POS vectorizer; the same vocabulary (and
# therefore the same column order) applies to both the train and test matrices.
pos_tag_names = pos_vectorizer.get_feature_names_out()

_plot_pos_tag_distribution(X_train_pos_vec, y_train, pos_tag_names, 'Training', 'Train')
_plot_pos_tag_distribution(X_test_pos_vec, y_test, pos_tag_names, 'Test', 'Test')



In [None]:
import matplotlib.pyplot as plt


# Positional row indices of each class in the training split (1 = AI, 0 = Human).
indices_ai_train = [i for i, label in enumerate(y_train) if label == 1]
indices_human_train = [i for i, label in enumerate(y_train) if label == 0]

# Positional row indices of each class in the test split.
indices_ai_test = [i for i, label in enumerate(y_test) if label == 1]
indices_human_test = [i for i, label in enumerate(y_test) if label == 0]

# Feature names come from the fitted TF-IDF vectorizer, which this notebook
# names `tfidf_vect`. Train and test matrices share its vocabulary.
tfidf_feature_names = tfidf_vect.get_feature_names_out()

# NOTE(review): one bar and one tick label is drawn per TF-IDF feature. With
# unigram+bigram vocabularies this can be a very large number of bars;
# consider restricting the plot to a subset of features.
bar_width = 0.35
r1 = np.arange(X_train_tfidf.shape[1])
r2 = r1 + bar_width
tick_positions = r1 + bar_width / 2

# Average TF-IDF value per feature for each class in the training set.
tfidf_ai_train = np.asarray(X_train_tfidf[indices_ai_train].mean(axis=0)).ravel()
tfidf_human_train = np.asarray(X_train_tfidf[indices_human_train].mean(axis=0)).ravel()

plt.figure(figsize=(15, 6))
plt.bar(r1, tfidf_ai_train, color='blue', width=bar_width, edgecolor='grey', label='AI - Train - TF-IDF')
plt.bar(r2, tfidf_human_train, color='orange', width=bar_width, edgecolor='grey', label='Human - Train - TF-IDF')

plt.title('TF-IDF Analysis - Training Set')
plt.xlabel('TF-IDF Features')
plt.ylabel('Average TF-IDF Value')
plt.xticks(tick_positions, tfidf_feature_names, rotation=45, ha='right')
plt.legend()
plt.show()

# Same analysis for the test set.
tfidf_ai_test = np.asarray(X_test_tfidf[indices_ai_test].mean(axis=0)).ravel()
tfidf_human_test = np.asarray(X_test_tfidf[indices_human_test].mean(axis=0)).ravel()

plt.figure(figsize=(15, 6))
plt.bar(r1, tfidf_ai_test, color='blue', width=bar_width, edgecolor='grey', label='AI - Test - TF-IDF')
plt.bar(r2, tfidf_human_test, color='orange', width=bar_width, edgecolor='grey', label='Human - Test - TF-IDF')

plt.title('TF-IDF Analysis - Test Set')
plt.xlabel('TF-IDF Features')
plt.ylabel('Average TF-IDF Value')
plt.xticks(tick_positions, tfidf_feature_names, rotation=45, ha='right')
plt.legend()
plt.show()




In [None]:

# Concatenate the three feature families column-wise:
# bag-of-words counts | TF-IDF weights | POS-tag counts.
# NOTE: .toarray() densifies the sparse matrices, so memory grows with
# (number of documents) x (total number of features).
X_train_combined = np.hstack((X_train_vectorized.toarray(), X_train_tfidf.toarray(), X_train_pos_vec.toarray()))
X_test_combined = np.hstack((X_test_vectorized.toarray(), X_test_tfidf.toarray(), X_test_pos_vec.toarray()))

# Standardize the features. The scaler's statistics are learned from the
# training data ONLY; the test data is transformed with those same statistics.
# Refitting on the test matrix would leak test information and would leave
# `scaler` (reused by later cells) calibrated on the wrong data.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_combined)
X_test_scaled = scaler.transform(X_test_combined)


In [None]:

# Define a simple logistic regression model
class LogisticRegressionModel(nn.Module):
    """Logistic regression: one linear layer followed by a sigmoid.

    ``input_size`` is the number of input features; the model outputs one
    value in (0, 1) per input row.
    """

    def __init__(self, input_size):
        super().__init__()
        self.linear = nn.Linear(input_size, 1)

    def forward(self, x):
        """Map a batch of feature rows to sigmoid-activated scores."""
        logits = self.linear(x)
        return torch.sigmoid(logits)

In [None]:
# Seed torch so the model's initial weights are reproducible.
torch.manual_seed(0)

# Carve a validation set out of the scaled training data. The notebook defines
# no validation split anywhere else, and the training loop needs one to report
# a validation loss.
# NOTE(review): the scaler was fitted on the full training matrix, so the
# validation rows contributed to the scaling statistics.
X_fit_scaled, X_val_scaled, y_fit, y_val = train_test_split(
    X_train_scaled, y_train.values, test_size=0.2, random_state=0
)

# Convert the arrays to float32 tensors.
X_train_tensor = torch.as_tensor(X_fit_scaled, dtype=torch.float32)
y_train_tensor = torch.as_tensor(y_fit, dtype=torch.float32)

X_val_tensor = torch.as_tensor(X_val_scaled, dtype=torch.float32)
y_val_tensor = torch.as_tensor(y_val, dtype=torch.float32)

# Initialize the model, loss function, and optimizer.
input_size = X_train_tensor.shape[1]
model = LogisticRegressionModel(input_size)
criterion = nn.BCELoss()  # binary cross-entropy on the model's sigmoid outputs
optimizer = optim.SGD(model.parameters(), lr=0.01)

In [None]:
num_epochs = 10
train_loss_values, val_loss_values = [], []

# Targets as column vectors so they line up with the model's (N, 1) output.
train_targets = y_train_tensor.view(-1, 1)
val_targets = y_val_tensor.view(-1, 1)

for epoch in range(num_epochs):
    # --- one full-batch gradient step on the training tensors ---
    model.train()
    optimizer.zero_grad()
    loss = criterion(model(X_train_tensor), train_targets)
    loss.backward()
    optimizer.step()
    train_loss_values.append(loss.item())

    # --- validation loss, computed without tracking gradients ---
    model.eval()
    with torch.no_grad():
        val_loss = criterion(model(X_val_tensor), val_targets)
    val_loss_values.append(val_loss.item())

    # Report both losses once per epoch.
    print(f'Epoch [{epoch + 1}/{num_epochs}], Train Loss: {loss.item():.4f}, Val Loss: {val_loss.item():.4f}')

# After the training loop, you can plot the loss values, make predictions, and evaluate the model further.

In [None]:
# Training and validation loss per epoch, drawn on a single set of axes.
fig, ax = plt.subplots(figsize=(10, 5))
loss_curves = (
    (train_loss_values, 'Training Loss'),
    (val_loss_values, 'Validation Loss'),
)
for loss_history, curve_label in loss_curves:
    ax.plot(loss_history, label=curve_label)

ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.set_title('Training and Validation Loss Over Epochs')
ax.legend()
plt.show()

In [None]:


# Convert the scaled test features to a tensor.
X_test_tensor = torch.Tensor(X_test_scaled)

# Predicted scores in (0, 1) for the held-out split, without gradient tracking.
model.eval()
with torch.no_grad():
    predictions = model(X_test_tensor)

# Flatten (N, 1) -> (N,) explicitly; a bare squeeze() would collapse a
# single-sample batch to a 0-d array.
test_probabilities = predictions.view(-1).numpy()
binary_predictions = (test_probabilities >= 0.5).astype(int)

print(binary_predictions)
print(y_test)
report = classification_report(y_test, binary_predictions)

# ROC AUC is a ranking metric: it must be computed from the continuous scores.
# Feeding it thresholded 0/1 predictions only evaluates a single operating point.
print("AUC score is", roc_auc_score(y_test, test_probabilities))
print("Balanced accuracy is", balanced_accuracy_score(y_test, binary_predictions))
print("Classification Report:\n", report)



In [None]:


     # Calculate ROC curve
# Use the model's continuous scores (`predictions`, computed in the evaluation
# cell) rather than thresholded 0/1 labels: hard labels yield a degenerate
# three-point curve and an AUC that reflects only one threshold.
roc_scores = predictions.detach().view(-1).numpy()

# ROC curve: false-positive rate vs. true-positive rate over all thresholds.
fpr, tpr, thresholds = roc_curve(y_test, roc_scores)

# Area under that curve.
auc_value = roc_auc_score(y_test, roc_scores)

# Plot the curve together with the chance diagonal.
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'AUC = {auc_value:.2f}')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic Curve')
plt.legend(loc='lower right')
plt.show()




In [None]:


# Persist the trained model object to disk with joblib.
model_filename = "drive/MyDrive/logistic_regression_model.joblib"
joblib.dump(model, filename=model_filename)



In [None]:
import joblib

model_filename = "drive/MyDrive/logistic_regression_model.joblib"

# NOTE: joblib.load unpickles arbitrary Python objects; only load files that
# were produced by this notebook.
loaded_model = joblib.load(model_filename)
loaded_model.eval()


# Load test data
test_df = pd.read_csv("drive/MyDrive/test.csv")

# Apply the same preprocessing steps.
# Missing text is filled BEFORE punctuation stripping so that every value
# reaching remove_punctuation is a string; the result is assigned back instead
# of relying on a chained in-place fillna.
test_df['text'] = test_df['text'].fillna("NA").apply(remove_punctuation)

# Binary target: 1 where the label equals the literal string '"AI"', else 0.
test_df["label_int"] = np.where(test_df["label"] == '"AI"', 1, 0)

# Feature extraction (bag-of-words) with the vocabulary fitted on the training data
X_test_counts = vect.transform(test_df["text"])

# Feature extraction using the fitted TF-IDF vectorizer
X_test_tfidf = tfidf_vect.transform(test_df["text"])

# POS tagging and vectorization with the fitted POS vectorizer
X_test_pos = [preprocess_text(text) for text in test_df["text"]]
X_test_pos_vec = pos_vectorizer.transform(X_test_pos)

# Concatenate features in the same column order used for training
X_test_combined = np.hstack((X_test_counts.toarray(), X_test_tfidf.toarray(), X_test_pos_vec.toarray()))

# Scale with the scaler fitted earlier in the notebook (transform only, no refit)
X_test_scaled = scaler.transform(X_test_combined)

X_test_tensor = torch.Tensor(X_test_scaled)

# Predicted scores in (0, 1), computed without gradient tracking
with torch.no_grad():
    predictions = loaded_model(X_test_tensor)

# Flatten (N, 1) -> (N,) explicitly; a bare squeeze() would collapse a
# single-row test file to a 0-d array.
test_probabilities = predictions.view(-1).numpy()
binary_predictions = (test_probabilities >= 0.5).astype(int)

print(binary_predictions)
print(test_df["label_int"])
report = classification_report(test_df['label_int'], binary_predictions)

# ROC AUC is computed from the continuous scores, not the thresholded labels.
print("AUC score is", roc_auc_score(test_df['label_int'], test_probabilities))
print("Balanced accuracy is", balanced_accuracy_score(test_df['label_int'], binary_predictions))
print("Classification Report:\n", report)




In [None]:
# Show the text, true label and predicted label for up to the first 10 samples.
n_preview = min(10, len(X_test_tensor))
separator = "=" * 50
preview_texts = test_df['text']
preview_labels = test_df['label']
for i in range(n_preview):
    print(f"Text: {preview_texts.iloc[i]}")
    print(f"True Label: {preview_labels.iloc[i]}, Predicted Label: {binary_predictions[i]}")
    print(separator)

In [None]:
import joblib

model_filename = "drive/MyDrive/logistic_regression_model.joblib"

# NOTE: joblib.load unpickles arbitrary Python objects; only load files that
# were produced by this notebook.
loaded_model = joblib.load(model_filename)
loaded_model.eval()


# Load test data
test_df = pd.read_csv("drive/MyDrive/test_new.csv")

# Apply the same preprocessing steps.
# Missing text is filled BEFORE punctuation stripping so that every value
# reaching remove_punctuation is a string; the result is assigned back instead
# of relying on a chained in-place fillna.
test_df['text'] = test_df['text'].fillna("NA").apply(remove_punctuation)

# Binary target: 1 where the label equals the literal string '"AI"', else 0.
test_df["label_int"] = np.where(test_df["label"] == '"AI"', 1, 0)

# Feature extraction (bag-of-words) with the vocabulary fitted on the training data
X_test_counts = vect.transform(test_df["text"])

# Feature extraction using the fitted TF-IDF vectorizer
X_test_tfidf = tfidf_vect.transform(test_df["text"])

# POS tagging and vectorization with the fitted POS vectorizer
X_test_pos = [preprocess_text(text) for text in test_df["text"]]
X_test_pos_vec = pos_vectorizer.transform(X_test_pos)

# Concatenate features in the same column order used for training
X_test_combined = np.hstack((X_test_counts.toarray(), X_test_tfidf.toarray(), X_test_pos_vec.toarray()))

# Scale with the scaler fitted earlier in the notebook (transform only, no refit)
X_test_scaled = scaler.transform(X_test_combined)

X_test_tensor = torch.Tensor(X_test_scaled)

# Predicted scores in (0, 1), computed without gradient tracking
with torch.no_grad():
    predictions = loaded_model(X_test_tensor)

# Flatten (N, 1) -> (N,) explicitly; a bare squeeze() would collapse a
# single-row test file to a 0-d array.
test_probabilities = predictions.view(-1).numpy()
binary_predictions = (test_probabilities >= 0.5).astype(int)

# Persist the hard 0/1 predictions, one per line.
np.savetxt('/content/drive/MyDrive/pred_LR.csv', binary_predictions, delimiter=',',  fmt='%.2f')

print(binary_predictions)
print(test_df["label_int"])
report = classification_report(test_df['label_int'], binary_predictions)

# ROC AUC is computed from the continuous scores, not the thresholded labels.
print("AUC score is", roc_auc_score(test_df['label_int'], test_probabilities))
print("Balanced accuracy is", balanced_accuracy_score(test_df['label_int'], binary_predictions))

# Per-class precision / recall / F1 (label 1 = AI, label 0 = Human).
precision_ai = precision_score(test_df['label_int'], binary_predictions, pos_label=1)
precision_human = precision_score(test_df['label_int'], binary_predictions, pos_label=0)

recall_ai = recall_score(test_df['label_int'], binary_predictions, pos_label=1)
recall_human = recall_score(test_df['label_int'], binary_predictions, pos_label=0)

f1_ai = f1_score(test_df['label_int'], binary_predictions, pos_label=1)
f1_human = f1_score(test_df['label_int'], binary_predictions, pos_label=0)

# Pin the label order so the matrix is always 2x2 (tn, fp, fn, tp when
# flattened), even if one class is absent from the labels or the predictions.
conf_matrix = confusion_matrix(test_df['label_int'], binary_predictions, labels=[0, 1])

print("Classification Report:\n", report)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Per-class scores computed in the evaluation cell, in the same order as `metrics`.
metrics = [ 'Precision', 'Recall', 'F1-Score']
ai_values = [precision_ai, recall_ai, f1_ai]
human_values = [precision_human, recall_human, f1_human]

# One group of two bars per metric.
num_metrics = len(metrics)
indices = np.arange(num_metrics)
bar_width = 0.1

fig, ax = plt.subplots(figsize=(6, 4))

# AI bars sit at the group position, Human bars one bar-width to the right.
ax.bar(indices, ai_values, bar_width, label='AI', color='blue', alpha=0.5)
ax.bar(indices + bar_width, human_values, bar_width, label='Human', color='orange')

# Larger, black tick labels on both axes.
ax.tick_params(axis='both', labelsize=15, labelcolor='black')

# Bold axis labels.
for axis_label in (ax.xaxis.label, ax.yaxis.label):
    axis_label.set_fontweight('bold')

# Centre each tick between the two bars of its group and name it after the metric.
ax.set_xticks(indices + bar_width / 2)
ax.set_xticklabels(metrics)

ax.legend(fontsize=6, loc="best")
ax.set_ylabel('Metric Values')

plt.show()

In [None]:
# Unpack the four cells of the 2x2 confusion matrix in row-major order.
# (Raises ValueError if `conf_matrix` does not contain exactly four entries.)
tn, fp, fn, tp = conf_matrix.ravel()

labels = ['True Negative', 'False Positive', 'False Negative', 'True Positive']
values = [tn, fp, fn, tp]

# Explicit numeric bar positions: fixing the tick locations before assigning
# tick labels is what matplotlib expects and avoids its warning about labels
# being set on non-fixed ticks.
positions = np.arange(len(labels))

fig, ax = plt.subplots()
bars = ax.bar(positions, values, color=['pink', 'yellow', 'orange', 'blue'], alpha=0.8, width=0.3)

# Axis label and title
ax.set_ylabel('Count')
ax.set_title('Confusion Matrix')

# Annotate each bar with its count, centred just above the bar.
for bar in bars:
    yval = bar.get_height()
    ax.text(bar.get_x() + bar.get_width() / 2, yval, round(yval, 2), ha='center', va='bottom')

ax.tick_params(axis='y', labelsize=20, labelcolor='black')
ax.set_xticks(positions)
ax.set_xticklabels(labels, fontweight='bold')


plt.show()