In [None]:
# Install the Kaggle API token; kaggle.json must already be in the working directory.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Restrict the credentials file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json


## Importing the dataset from Kaggle

In [None]:
# Download the "AI vs Human Text" dataset archive into the working directory
# (uses the Kaggle API token installed under ~/.kaggle).
!kaggle datasets download shanegerami/ai-vs-human-text


Dataset URL: https://www.kaggle.com/datasets/shanegerami/ai-vs-human-text
License(s): other
Downloading ai-vs-human-text.zip to /content
 99% 346M/350M [00:00<00:00, 685MB/s]
100% 350M/350M [00:00<00:00, 734MB/s]


In [None]:
!unzip ai-vs-human-text.zip



Archive:  ai-vs-human-text.zip
  inflating: AI_Human.csv            


## Importing required libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
from nltk.stem import PorterStemmer
# NLTK data used further down: 'stopwords' backs stopwords.words('english');
# 'punkt_tab' is presumably the tokenizer data word_tokenize relies on — confirm
# against the installed NLTK version.
nltk.download('punkt_tab')
nltk.download('stopwords')




[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# English stop words held in a set, since the preprocessing function tests
# every token for membership in it.
stop_words = set(stopwords.words('english'))

**Reading the dataset**

In [None]:
# The archive was extracted into the working directory, so a relative path
# loads the file on any machine instead of only under Colab's /content.
df=pd.read_csv("AI_Human.csv")

In [None]:
# (rows, columns) of the raw CSV, as a sanity check before any filtering.
df.shape

(487235, 2)

In [None]:
# Preview the first rows: the essay `text` and its `generated` label.
df.head()

Unnamed: 0,text,generated
0,Cars. Cars have been around since they became ...,0.0
1,Transportation is a large necessity in most co...,0.0
2,"""America's love affair with it's vehicles seem...",0.0
3,How often do you ride in a car? Do you drive a...,0.0
4,Cars are a wonderful thing. They are perhaps o...,0.0


In [None]:
# Rows per label in the raw data, to see how imbalanced the classes are.
df['generated'].value_counts()

Unnamed: 0_level_0,count
generated,Unnamed: 1_level_1
0.0,305797
1.0,181438


**Balancing the dataset**

In [None]:
# Balance the classes by downsampling the human rows (generated == 0) to the
# number of AI rows (generated == 1).
df_AI = df[df['generated'] == 1]
df_Human = df[df['generated'] == 0]
# Fixed seed: without random_state a different subset of human essays is drawn
# on every run, so the metrics reported further down could not be reproduced.
# sample() draws without replacement, so this assumes the human class is at
# least as large as the AI class.
df_Human_downsampled = df_Human.sample(df_AI.shape[0], random_state=42)
# Original index labels are kept on purpose; rows are not shuffled here.
df_balanced = pd.concat([df_AI, df_Human_downsampled])
df_balanced['generated'].value_counts()

Unnamed: 0_level_0,count
generated,Unnamed: 1_level_1
1.0,181438
0.0,181438


In [None]:
# From here on `df` refers to the balanced frame; the raw frame is no longer
# reachable under this name without re-reading the CSV.
df=df_balanced

In [None]:
# Check the balanced frame; rows keep their index labels from the raw CSV.
df.head()

Unnamed: 0,text,generated
704,"This essay will analyze, discuss and prove one...",1.0
740,I strongly believe that the Electoral College ...,1.0
1262,"Limiting car use causes pollution, increases c...",1.0
1378,Car-free cities have become a subject of incre...,1.0
1379,"Car Free Cities Car-free cities, a concept ga...",1.0


In [None]:
# Show one full essay, looked up by index label (704 is a label, not a position).
df.loc[704, 'text']

"This essay will analyze, discuss and prove one reason in favor of keeping the Electoral College in the United States for its presidential elections. One of the reasons to keep the electoral college is that it is better for smaller, more rural states to have more influence as opposed to larger metropolitan areas that have large populations. The electors from these states are granted two votes each. Those from larger, more populated areas are granted just one vote each. Smaller states tend to hold significant power because their two votes for president and vice president add up more than the votes of larger states that have many electors. This is because of the split of the electoral votes. Some argue that electors are not bound to vote for the candidate who won the most votes nationally. They do not have to vote for their own state's nominee unless their state has a winner take all system. However, there are states that have adopted laws that force their electors to vote for their stat

**Defining the function for preprocessing the data**

In [None]:
import re
stemmer=PorterStemmer()
def text_preproccessor(text):
  """Normalise one document before vectorisation.

  Steps, in order: delete every character that is not an ASCII letter or
  whitespace, tokenise, lower-case, drop English stop words, and Porter-stem
  what remains.

  Args:
    text: Raw document. Anything that is not a ``str`` (for example the NaN
      pandas produces for an empty CSV cell) is treated as an empty document.

  Returns:
    The surviving stems joined by single spaces; ``''`` when nothing survives.
  """
  # re.sub raises TypeError on non-string input, so missing values are mapped
  # to an empty document instead of aborting a whole DataFrame.apply run.
  if not isinstance(text, str):
    return ''
  # Characters are deleted rather than replaced by a space, so hyphenated or
  # apostrophised words fuse ("car-free" -> "carfree").
  text = re.sub(r'[^a-zA-Z\s]', '', text)
  # Lower-case each token once and reuse it for both the stop-word test and
  # the stemmer.
  lowered_tokens = (token.lower() for token in word_tokenize(text))
  return ' '.join(
    stemmer.stem(token) for token in lowered_tokens if token not in stop_words
  )


**Applying the function to the text column**

In [None]:
# Overwrites the raw essays with their preprocessed form. Not idempotent:
# running this cell a second time preprocesses the already-processed text.
df['text']=df['text'].apply(text_preproccessor)

In [None]:
# Inspect the text column after stop-word removal and stemming.
df.head()

Unnamed: 0,text,generated
704,essay analyz discuss prove one reason favor ke...,1.0
740,strongli believ elector colleg remain way bett...,1.0
1262,limit car use caus pollut increas cost user re...,1.0
1378,carfre citi becom subject increas interest deb...,1.0
1379,car free citi carfre citi concept gain tractio...,1.0


**Training and testing the model**

In [None]:
# Features are the preprocessed essays; the target is the `generated` label.
X = df['text']
Y = df['generated']

# Hold out 20% of the rows for evaluation; the seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

# Vectoriser and classifier are chained in one estimator, so TF-IDF is fitted
# on the training split only and the held-out rows are transformed with it.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression()),
]).fit(X_train, Y_train)

# Score the held-out split.
Y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(Y_test, Y_pred)
print("Accuracy:", accuracy)
print(classification_report(Y_test, Y_pred))

Accuracy: 0.9891561948853616
              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99     36296
         1.0       0.99      0.99      0.99     36280

    accuracy                           0.99     72576
   macro avg       0.99      0.99      0.99     72576
weighted avg       0.99      0.99      0.99     72576



**Saving the model with joblib**

In [None]:
import joblib
# Persist the fitted TF-IDF + logistic-regression pipeline to model.pkl.
# The pickle holds only those two steps; text_preproccessor is not part of it,
# so code that loads the model must apply the same preprocessing itself.
# NOTE(review): the saved output of this cell lists
# 'AI_VS_HUMAN_classification_model.pkl', so it predates the current file
# name — re-run the cell to refresh it.
joblib.dump(pipeline, 'model.pkl')

['AI_VS_HUMAN_classification_model.pkl']

**Testing the model on random text data**

In [None]:
# Ad-hoc check on one unseen passage.
sample_text = """Steps happening under the hood: TF-IDF vectorization Converts all text into a sparse matrix. Complexity: roughly O(number of documents × vocabulary size). For 487k rows: TF-IDF fitting can take several minutes on CPU, depending on average text length and vocabulary size. Logistic Regression fitting Default solver: lbfgs (good for small-medium datasets). Complexity: O(n_features × n_samples × iterations). With 487k rows and TF-IDF vectors (say 50k–100k features), training may take 5–15 minutes on a single CPU. You can speed this up with: solver='saga' (supports sparse matrices, faster on large data) max_iter=1000 (increase if convergence warning occurs) Prediction & accuracy Predicting with sparse TF-IDF vectors is usually very fast (<1 min even for half a million rows)."""
# The pipeline was fitted on text that had been through text_preproccessor, so
# the same transformation must be applied here; raw text would be vectorised
# against a vocabulary of stems it mostly does not match.
pipeline.predict([text_preproccessor(sample_text)])

array([1.])