In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from nltk.util import ngrams
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [2]:
# Read the CSV file into a DataFrame
input_file = 'Training_Essay_Data.csv'

df = pd.read_csv(input_file)
data = df.sample(n=10000, random_state=42)

In [3]:
# Text preprocessing function
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove special characters and numbers
    text = re.sub(r'[^a-z\s]', '', text)
    # Tokenize text
    tokens = word_tokenize(text)
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Join tokens back into text
    processed_text = ' '.join(tokens)
    return processed_text

# Preprocess text data
data['processed_text'] = data['text'].apply(preprocess_text)

print(data['processed_text'].head())

17004    would agree emersons world important things co...
14459    advice wonderful helpful everyone since people...
28492    think limiting car usage great environment lot...
10134    nobody know face got mars never face could jum...
23657    student studied lot subjects school stage hesh...
Name: processed_text, dtype: object


In [4]:
# Tokenization and vocabulary building
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(data['processed_text'])

# Get the vocabulary
vocab = vectorizer.get_feature_names_out()

print("Vocabulary size:", len(vocab))

Vocabulary size: 47202


In [5]:
# Convert the BoW matrix to a DataFrame for better visualization
bow_df = pd.DataFrame(X.toarray(), columns=vocab)

print(bow_df.head())

   aa  aaa  aaaeal  aaccording  aactive  aafetoy  aagain  aaid  aake  aaken  \
0   0    0       0           0        0        0       0     0     0      0   
1   0    0       0           0        0        0       0     0     0      0   
2   0    0       0           0        0        0       0     0     0      0   
3   0    0       0           0        0        0       0     0     0      0   
4   0    0       0           0        0        0       0     0     0      0   

   ...  zooming  zoos  zroom  zs  zuckerberg  zuckerburg  zygomatic  \
0  ...        0     0      0   0           0           0          0   
1  ...        0     0      0   0           0           0          0   
2  ...        0     0      0   0           0           0          0   
3  ...        0     0      0   0           0           0          0   
4  ...        0     0      0   0           0           0          0   

   zygomatice  zygosmtic  zzzzs  
0           0          0      0  
1           0          0      

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, data['generated'], test_size=0.2, random_state=42)

In [7]:
print("Shape of X_train:", X_train.shape)

Shape of X_train: (8000, 47202)


In [8]:
model = LogisticRegression()
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [9]:
y_pred = model.predict(X_test)

In [10]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.988


In [11]:
from sklearn.svm import SVC

In [12]:
svm_model = SVC(kernel='linear') 
svm_model.fit(X_train, y_train)

SVC(kernel='linear')

In [13]:
y_pred_SVM = svm_model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred_SVM))

Accuracy: 0.985


In [15]:
from sklearn.metrics import classification_report

In [16]:
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      1198
           1       0.98      0.99      0.99       802

    accuracy                           0.99      2000
   macro avg       0.99      0.99      0.99      2000
weighted avg       0.99      0.99      0.99      2000



In [17]:
input_text = input("Enter the text to predict: ")

Enter the text to predict: 
  @media print {
    .ms-editor-squiggles-container {
      display:none !important;
    }
  }
  .ms-editor-squiggles-container {
    all: initial;
  }America's love of cars may soon be spiraling down. With America's car culture seemingly coming to an end, there will be more alternative ways to get to work, school, shopping districts, and etc. As the years come and go by quickly, Americans are buying less cars and obtaining fewer licenses for themselves. The advantages we can recieve by limiting our car usage is that it takes away stress, lowers air pollution, and benefits daily businesses.  First, Limiting car usage takes away stress. As businessman Carlos Arturo Plaza states:"It's a good opportunity to take away stress..."Â  People who no longer own a car will not have to worry as much about their car. Such as the price of gas rising, car payments, and insurance payments. These type of payments every month often put a big dent in a person's wallet. If we t

In [18]:
# Vectorize the input text
input_vec = vectorizer.transform([input_text])

In [19]:
# Make prediction
predicted_label = model.predict(input_vec)
predicted_label

array([0], dtype=int64)