In [1]:
from itertools import product
import pandas as pd
import numpy as np

from sklearn.metrics import mean_squared_error
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.preprocessing import label_binarize
from sklearn.neural_network import MLPClassifier
from collections import defaultdict

from statistics import mean


data = pd.read_csv("AI_Human.csv")
data['text'] = data['text'].str.strip()
data['text'] = data['text'].replace('',np.nan)
data = data.dropna()
# remove essays in the dataset that include only whitespace (there were some weird cases)
essays = list(data['text'])
target = list(data['generated'])
# split the data into two lists, one of the pure text and another of the target (whether or not it was AI-generated)

In [7]:
len(essays)
len(target)

487231

In [2]:
features = {}

for symbol in [".","\n",",","?",":",";",'"',"'","-","they","that"]:
    features[symbol] = [essay.count(symbol) for essay in essays]
# count the frequency of particular symbols or words and make them into a feature for the model

av_length = []
for essay in essays:
    words = [len(word) for word in essay.split()]
    av_length.append(mean(words))
# calculate average number of letters per word in each essay and make that a feature as well

features['length'] = av_length

[]

In [3]:
df = pd.DataFrame(features)
# create a dataframe out of the features
df.head()
df = pd.concat([data,df],axis=1)
df = df.drop(columns = ['text'])
# combine it with the original dataframe which has the target column (generated)

In [4]:
X = df.drop(columns = ['generated'])
y = df['generated']

X = X.dropna()
y = y.dropna()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.20, random_state=3141)
model = LogisticRegression()
model.fit(X_train,y_train)
# do a train test split and run a logistic regression model on the dataframe to predict the 'generated' column

0.869118597801882


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [5]:
print(classification_report(model.predict(X_test),y_test))
print(confusion_matrix(model.predict(X_test),y_test))
print(model.score(X_test,y_test))
# the model was 87% accurate! mistakes were pretty evenly split, based on the confusion matrix.

              precision    recall  f1-score   support

         0.0       0.91      0.88      0.90     63393
         1.0       0.79      0.85      0.82     34054

    accuracy                           0.87     97447
   macro avg       0.85      0.86      0.86     97447
weighted avg       0.87      0.87      0.87     97447

[[55881  7512]
 [ 5242 28812]]


In [7]:
model = LogisticRegression()
model.fit(X,y)
# retrain the model on the entire dataset instead of just a portion

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [10]:
'''
this segment converts the test essays, which were a txt file, into a dataframe in the same form as the other.
Most of this code is the same as before
'''
with open('ml_essays.txt') as f:
    essays = []
    generated = [0] * 12 + [1] * 11
    # the target column
    for i in range(12):
        essays.append(next(f).strip())
    
    for i in range(11):
        essays.append(next(f).strip())

features = {}

for symbol in [".","\n",",","?",":",";",'"',"'","-","they","that"]:
    features[symbol] = [essay.count(symbol) for essay in essays]

av_length = []
for essay in essays:
    words = [len(word) for word in essay.split()]
    av_length.append(mean(words))

features['length'] = av_length
features['generated'] = generated
        
df2 = pd.DataFrame(features)
df2.head()

Unnamed: 0,.,\n,",",?,:,;,"""",',-,they,that,length,generated
0,17,0,16,0,0,0,0,3,0,0,11,4.619617,0
1,20,0,4,1,0,1,0,10,0,1,3,4.678218,0
2,27,0,27,0,0,0,0,3,1,4,10,4.77918,0
3,16,0,19,0,0,1,0,3,0,1,5,4.762542,0
4,33,0,23,0,0,0,8,6,0,0,6,4.712264,0


In [13]:
X_test = df2.drop(columns = ['generated'])
y_test = df2['generated']

print(classification_report(model.predict(X_test),y_test))
print(confusion_matrix(model.predict(X_test),y_test))
print(model.score(X_test,y_test))

              precision    recall  f1-score   support

         0.0       0.75      0.90      0.82        10
         1.0       0.91      0.77      0.83        13

    accuracy                           0.83        23
   macro avg       0.83      0.83      0.83        23
weighted avg       0.84      0.83      0.83        23

[[ 9  1]
 [ 3 10]]
0.8260869565217391
