In [15]:
# Mount Google Drive at /content/drive so later cells can read and write files under it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [16]:
import pandas as pd

# Read the lemmatized dataset CSV from the mounted Google Drive project folder.
df_combined_lemmatized = pd.read_csv(
    '/content/drive/My Drive/Human vs AI Generated Text Classification/df_combined_lemmatized.csv'
)

In [17]:
# Preview the loaded DataFrame; the notebook renders it as a truncated table.
df_combined_lemmatized

Unnamed: 0,label,length_chars,length_words,punctuation_ratio,repetition_score,text
0,0,126,22,0.031746,0.045455,"as someone who follow food , I believe privacy..."
1,1,141,19,0.014184,0.000000,analysis indicate that travel be associate wit...
2,0,112,17,0.026786,0.000000,I recently experience education in my day to d...
3,1,101,14,0.009901,0.000000,analysis indicate that sport be associate with...
4,0,78,14,0.025641,0.000000,"in my experience , finance often lead to user ..."
...,...,...,...,...,...,...
20872,1,2287,352,0.010494,0.281250,the impact of technology on art be an interest...
20873,1,943,147,0.015907,0.299320,distance learning have be rapidly grow in popu...
20874,0,2406,434,0.012053,0.767281,people do not have own decision because they l...
20875,0,2406,434,0.018703,0.559908,how would you feel if your principle tell you ...


In [18]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


def _ensure_nltk_resource(resource_path, package_id):
    """Make sure an NLTK data package is available, downloading it only if missing.

    Args:
        resource_path: Path understood by ``nltk.data.find``
            (for example ``'corpora/stopwords'``).
        package_id: Identifier passed to ``nltk.download``
            (for example ``'stopwords'``).

    Raises:
        RuntimeError: If the package is missing and the download does not succeed.
    """
    try:
        nltk.data.find(resource_path)
        already_available = True
    except LookupError:
        already_available = False

    if already_available:
        print(f"NLTK '{package_id}' resource already available.")
        return

    # nltk.download() reports failure through its boolean return value instead
    # of raising, so the result must be checked before claiming success.
    if not nltk.download(package_id):
        raise RuntimeError(f"Failed to download NLTK package '{package_id}'.")
    print(f"NLTK '{package_id}' resource downloaded.")


# Resources needed for stopword filtering and word tokenization in later cells.
for _resource_path, _package_id in (
    ('corpora/stopwords', 'stopwords'),
    ('tokenizers/punkt', 'punkt'),
    ('tokenizers/punkt_tab/english/', 'punkt_tab'),
):
    _ensure_nltk_resource(_resource_path, _package_id)

print("NLTK modules imported and stopwords/punkt corpora checked/downloaded.")

NLTK 'punkt' tokenizer downloaded/ensured.
NLTK 'punkt_tab' resource already available.
NLTK modules imported and stopwords/punkt corpora checked/downloaded.


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [19]:
# Keep the English stopword list in a set for fast membership tests.
english_stopwords = set(stopwords.words('english'))


def remove_stopwords_from_text(text):
    """Return ``text`` with English stopwords removed.

    The text is tokenized with ``word_tokenize``; every token whose lowercase
    form is in ``english_stopwords`` is dropped and the remaining tokens are
    joined with single spaces. Non-string values (for example NaN) are
    returned unchanged.
    """
    if not isinstance(text, str):
        return text

    kept_tokens = []
    for token in word_tokenize(text):
        if token.lower() not in english_stopwords:
            kept_tokens.append(token)
    return ' '.join(kept_tokens)

# Build the stopword-free version of every document in a new column,
# leaving the original 'text' column untouched.
df_combined_lemmatized['cleaned_text'] = df_combined_lemmatized['text'].apply(remove_stopwords_from_text)
print("Stopwords removed from 'text' column and stored in 'cleaned_text'.")
# Use the notebook's rich display (as the later preview cell does) rather than
# print(), which wraps the two columns into separate plain-text blocks.
display(df_combined_lemmatized[['text', 'cleaned_text']].head())

Stopwords removed from 'text' column and stored in 'cleaned_text'.
                                                text  \
0  as someone who follow food , I believe privacy...   
1  analysis indicate that travel be associate wit...   
2  I recently experience education in my day to d...   
3  analysis indicate that sport be associate with...   
4  in my experience , finance often lead to user ...   

                                        cleaned_text  
0  someone follow food , believe privacy concern ...  
1  analysis indicate travel associate privacy con...  
2  recently experience education day day life fin...  
3  analysis indicate sport associate cost vary gr...  
4  experience , finance often lead user report mi...  


In [20]:
from sklearn.feature_extraction.text import CountVectorizer

# Replace missing documents with empty strings so the vectorizer only ever
# receives text. This overwrites the 'cleaned_text' column in place.
df_combined_lemmatized['cleaned_text'] = df_combined_lemmatized['cleaned_text'].fillna('')

# Learn the vocabulary and build the document-term count matrix in one step.
vectorizer_bow = CountVectorizer()
bow_features = vectorizer_bow.fit_transform(df_combined_lemmatized['cleaned_text'])

print(f"Shape of BoW features: {bow_features.shape}")

Shape of BoW features: (20877, 38038)


In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF vectorizer on the cleaned documents and transform them into
# the document-term weight matrix.
vectorizer_tfidf = TfidfVectorizer()
tfidf_features = vectorizer_tfidf.fit_transform(
    df_combined_lemmatized['cleaned_text']
)

print(f"Shape of TF-IDF features: {tfidf_features.shape}")

Shape of TF-IDF features: (20877, 38038)


In [22]:
import subprocess
import sys

# Install gensim with the interpreter that runs this kernel. Passing the
# command as a list avoids shell quoting problems with the interpreter path,
# and check=True raises CalledProcessError when pip fails, so the message
# below is only printed after a successful install.
subprocess.run([sys.executable, "-m", "pip", "install", "gensim"], check=True)
print("gensim library installed.")

gensim library installed.


In [23]:
from gensim.models import Word2Vec

# One token list per document: split each cleaned text on whitespace.
# astype(str) ensures every entry is a string before splitting.
sentences = list(map(str.split, df_combined_lemmatized['cleaned_text'].astype(str)))

# Train the embeddings on the tokenized corpus.
word2vec_model = Word2Vec(
    sentences,
    vector_size=100,  # dimensionality of each word vector
    window=5,         # context window size
    min_count=5,      # ignore words seen fewer than 5 times
    workers=4,        # training threads
)

print("Word2Vec model trained.")
print(f"Number of unique words (vocabulary size): {len(word2vec_model.wv)}")
print(f"Word embedding dimension: {word2vec_model.vector_size}")

Word2Vec model trained.
Number of unique words (vocabulary size): 12664
Word embedding dimension: 100


## Summary of Text Feature Engineering and Complementary Numerical Features

### Results of Text-Based Feature Engineering:

*   **Bag of Words (BoW) Features**: Generated using `CountVectorizer` on the `cleaned_text` column.
    *   **Shape**: `bow_features.shape` is (20877, 38038).
    *   **Description**: This matrix represents the frequency of each word in the vocabulary across all documents. Each row corresponds to a document, and each column corresponds to a unique word. The values indicate how many times a word appears in a document.

*   **TF-IDF Features**: Generated using `TfidfVectorizer` on the `cleaned_text` column.
    *   **Shape**: `tfidf_features.shape` is (20877, 38038).
    *   **Description**: This matrix contains the Term Frequency-Inverse Document Frequency scores for each word in each document. TF-IDF gives more weight to words that are frequent in a document but rare across all documents, helping to highlight important words that are specific to certain texts.

*   **Word2Vec Model**: Trained on the `cleaned_text` column.
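    *   **Vocabulary Size**: The model learned embeddings for 12,664 unique words. Only words that occur at least 5 times in the corpus are kept (`min_count=5`), which is why this is smaller than the 38,038-term BoW/TF-IDF vocabulary.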
    *   **Embedding Dimension**: Each word is represented by a 100-dimensional vector.
    *   **Description**: Word2Vec captures semantic relationships between words by representing them as dense vectors in a continuous vector space. Words with similar meanings are located closer to each other in this space. These embeddings can be aggregated (e.g., averaged) per document to create document-level features.



**Utility in Enhancing Overall Model Learning**: By combining the stylometric features already present in the DataFrame (`length_chars`, `length_words`, `punctuation_ratio`, `repetition_score`) with the content-based features described above (BoW, TF-IDF, Word2Vec), a machine learning model can gain a more comprehensive understanding of the text. The numerical features provide a high-level overview of the text's structure and readability, while the text-based features delve into its semantic content. This multi-faceted approach allows the model to learn from both _what_ is being said and _how_ it is being said, leading to more robust and accurate classification, especially for distinguishing between human and AI-generated text.

In [24]:
# Show the first rows of the DataFrame, which now includes the 'cleaned_text' column.
display(df_combined_lemmatized.head())

Unnamed: 0,label,length_chars,length_words,punctuation_ratio,repetition_score,text,cleaned_text
0,0,126,22,0.031746,0.045455,"as someone who follow food , I believe privacy...","someone follow food , believe privacy concern ..."
1,1,141,19,0.014184,0.0,analysis indicate that travel be associate wit...,analysis indicate travel associate privacy con...
2,0,112,17,0.026786,0.0,I recently experience education in my day to d...,recently experience education day day life fin...
3,1,101,14,0.009901,0.0,analysis indicate that sport be associate with...,analysis indicate sport associate cost vary gr...
4,0,78,14,0.025641,0.0,"in my experience , finance often lead to user ...","experience , finance often lead user report mi..."


In [25]:
# Take an independent copy of the frame under its new name, then remove the
# old binding so later cells cannot refer to the old name by accident.
df_combined_lemmatized_feature_engineered = df_combined_lemmatized.copy(deep=True)
del df_combined_lemmatized

print("DataFrame renamed to 'df_combined_lemmatized_feature_engineered'.")
print(f"New DataFrame shape: {df_combined_lemmatized_feature_engineered.shape}")

DataFrame renamed to 'df_combined_lemmatized_feature_engineered'.
New DataFrame shape: (20877, 7)


In [26]:
# Persist the trained Word2Vec model to the Google Drive project folder.
# NOTE(review): `save` presumably writes gensim's own format rather than the
# word2vec C binary format the `.bin` suffix may suggest; confirm that
# downstream code reloads it with `Word2Vec.load`.
word2vec_model.save(
    '/content/drive/My Drive/Human vs AI Generated Text Classification/word2vec_model.bin'
)
print("Word2Vec model saved to Google Drive.")

Word2Vec model saved to Google Drive.


In [27]:
import joblib

# Serialize both fitted vectorizers to the Google Drive project folder so the
# same vocabularies can be reloaded later.
for _fitted_vectorizer, _destination in (
    (vectorizer_bow, '/content/drive/My Drive/Human vs AI Generated Text Classification/count_vectorizer.joblib'),
    (vectorizer_tfidf, '/content/drive/My Drive/Human vs AI Generated Text Classification/tfidf_vectorizer.joblib'),
):
    joblib.dump(_fitted_vectorizer, _destination)

print("CountVectorizer and TfidfVectorizer saved to Google Drive.")

CountVectorizer and TfidfVectorizer saved to Google Drive.


In [28]:
# Write the DataFrame (including 'cleaned_text') to Google Drive.
# index=False keeps the row index out of the CSV.
df_combined_lemmatized_feature_engineered.to_csv(
    '/content/drive/My Drive/Human vs AI Generated Text Classification/df_combined_lemmatized_feature_engineered.csv',
    index=False,
)
print("Feature-engineered DataFrame saved to Google Drive.")

Feature-engineered DataFrame saved to Google Drive.
