<a href="https://colab.research.google.com/github/SteevAbrahamThomas/new-test/blob/main/NLP_Classification_Donor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 1. Data Exploration & Preparation

In [17]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import tensorflow as tf
from tensorflow.keras import layers, Model, Input

In [18]:
# Load the donor dataset from the working directory, report its dimensions,
# and preview the first rows as the cell's displayed output.
csv_path = "donor.csv"
df = pd.read_csv(csv_path)
print("Shape:", df.shape)
df.head()

Shape: (109248, 14)


Unnamed: 0,id,teacher_prefix,school_state,project_grade_category,project_subject_categories,project_subject_subcategories,teacher_number_of_previously_posted_projects,project_is_approved,price,quantity,cleaned_titles,cleaned_essays,cleaned_summary,isdigit_summary
0,p253737,mrs,in,grades_prek_2,literacy_language,esl_literacy,0,0,154.6,23,educational support english learners home,students english learners working english seco...,students_need_opportunities_practice_beginning...,0
1,p258326,mr,fl,grades_6_8,history_civics_health_sports,civics_government_teamsports,7,1,299.0,1,wanted projector hungry learners,students arrive school eager learn polite gene...,students_need_projector_help_viewing_education...,0
2,p182444,ms,az,grades_6_8,health_sports,health_wellness_teamsports,1,0,516.85,22,soccer equipment awesome middle school students,true champions not always ones win guts mia ha...,students_need_shine_guards_athletic_socks_socc...,0
3,p246581,mrs,ky,grades_prek_2,literacy_language_math_science,literacy_mathematics,4,1,232.9,4,techie kindergarteners,work unique school filled esl english second l...,students_need_engage_reading_math_way_inspire_...,0
4,p104768,mrs,tx,grades_prek_2,math_science,mathematics,1,1,67.98,4,interactive math tools,second grade classroom next year made around 2...,students_need_hands_practice_mathematics_fun_p...,0


In [19]:
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appends a stray "None" line to the output.
df.info()

# Summary statistics for every column (numeric and non-numeric). Leaving it as
# the last expression lets the notebook render it as a table.
df.describe(include='all')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 109248 entries, 0 to 109247
Data columns (total 14 columns):
 #   Column                                        Non-Null Count   Dtype  
---  ------                                        --------------   -----  
 0   id                                            109248 non-null  object 
 1   teacher_prefix                                109248 non-null  object 
 2   school_state                                  109248 non-null  object 
 3   project_grade_category                        109248 non-null  object 
 4   project_subject_categories                    109248 non-null  object 
 5   project_subject_subcategories                 109248 non-null  object 
 6   teacher_number_of_previously_posted_projects  109248 non-null  int64  
 7   project_is_approved                           109248 non-null  int64  
 8   price                                         109248 non-null  float64
 9   quantity                                      10

## 2. NLP Processing Pipeline

In [20]:
# Fetch the NLTK corpora used below: the stop-word lists and WordNet
# (the data behind WordNetLemmatizer).
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)

# Shared by clean_text(): a lemmatizer instance and the English stop words,
# held in a set so membership checks are cheap.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Normalise one raw text value into a space-joined string of lemmas.

    Steps: lowercase, replace ``<...>`` tag-like spans with a space, replace
    every non-letter character with a space, split on whitespace, drop tokens
    found in the module-level ``stop_words`` set, and lemmatize the rest with
    the module-level ``lemmatizer``.

    Parameters
    ----------
    text : str or null
        The text to clean. Null values (as detected by ``pd.isnull``) are
        accepted.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; ``""`` for null input or
        when no tokens survive.
    """
    # Guard clause: missing values become an empty document.
    if pd.isnull(text):
        return ""

    lowered = text.lower()
    without_tags = re.sub(r"<.*?>", " ", lowered)
    letters_only = re.sub(r"[^a-zA-Z]", " ", without_tags)

    kept = []
    for token in letters_only.split():
        if token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return " ".join(kept)

# Fill missing essays with "" BEFORE casting to str: astype(str) on its own
# turns NaN into the literal string "nan", which would slip past the null
# guard in clean_text() and end up as a bogus "nan" token in the corpus.
df['clean_text'] = df['cleaned_essays'].fillna("").astype(str).apply(clean_text)

# Side-by-side preview of the input column and the cleaned result.
df[['cleaned_essays', 'clean_text']].head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,cleaned_essays,clean_text
0,students english learners working english seco...,student english learner working english second...
1,students arrive school eager learn polite gene...,student arrive school eager learn polite gener...
2,true champions not always ones win guts mia ha...,true champion always one win gut mia hamm quot...
3,work unique school filled esl english second l...,work unique school filled esl english second l...
4,second grade classroom next year made around 2...,second grade classroom next year made around k...


## 3. Embedding Approaches (TF-IDF & Word2Vec)

In [21]:
# Sparse TF-IDF features over unigrams and bigrams, capped at 5000 terms.
# NOTE(review): the vectorizer is fit on every row, before the train/test
# split made further down, so vocabulary and IDF statistics also see the
# held-out rows -- consider fitting on the training rows only.
corpus = df['clean_text']
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
X_tfidf = tfidf.fit_transform(corpus)
print("TF-IDF Shape:", X_tfidf.shape)

TF-IDF Shape: (109248, 5000)


In [22]:
# Whitespace-tokenise every cleaned essay; Word2Vec consumes lists of tokens.
sentences = [text.split() for text in df['clean_text'].tolist()]

# Train 100-dimensional word embeddings on the essay corpus itself.
w2v_model = Word2Vec(
    sentences=sentences,
    vector_size=100,
    window=5,
    min_count=2,
    workers=4,
)

def doc_vector(doc, model=None):
    """Embed a document as the mean of its in-vocabulary word vectors.

    Parameters
    ----------
    doc : str
        Whitespace-separated tokens.
    model : optional
        Object exposing a ``wv`` attribute that supports ``token in wv``,
        ``wv[token]`` and ``wv.vector_size``. Defaults to the notebook-level
        ``w2v_model``.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``wv.vector_size``: the element-wise mean of the
        vectors of the tokens found in the vocabulary, or all zeros when no
        token is found (including an empty document).
    """
    keyed = (w2v_model if model is None else model).wv
    vectors = [keyed[token] for token in doc.split() if token in keyed]
    if not vectors:
        # Size the fallback from the model instead of a hard-coded 100, so the
        # rows still stack if the embedding dimension is ever changed.
        return np.zeros(keyed.vector_size)
    return np.mean(vectors, axis=0)

# One averaged embedding per essay, stacked row-wise into a 2-D array
# (rows follow the order of df).
doc_vectors = [doc_vector(text) for text in df['clean_text']]
X_w2v = np.vstack(doc_vectors)
print("Word2Vec Shape:", X_w2v.shape)

Word2Vec Shape: (109248, 100)


## 4. Classical ML Models

In [23]:
# Binary target column and an 80/20 hold-out split of the TF-IDF features.
y = df['project_is_approved']
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, y, random_state=42, test_size=0.2
)

# fit() returns the estimator, so construction and training can be chained.
clf = LogisticRegression(max_iter=500).fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Per-class precision/recall/F1 first, then the raw confusion matrix.
report = classification_report(y_test, y_pred)
print(report)
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

              precision    recall  f1-score   support

           0       0.58      0.05      0.10      3316
           1       0.85      0.99      0.92     18534

    accuracy                           0.85     21850
   macro avg       0.72      0.52      0.51     21850
weighted avg       0.81      0.85      0.79     21850

[[  175  3141]
 [  125 18409]]


## 5. Deep Learning Model (Functional API)

In [24]:
# Split TF-IDF features, Word2Vec features and labels in ONE call so the three
# are partitioned with the same row indices by construction. Two separate
# calls only stay aligned while their test_size/random_state happen to match.
# test_size and random_state equal the classical-model split, so both models
# are evaluated on the same held-out rows.
(X_train_tfidf, X_test_tfidf,
 X_train_w2v, X_test_w2v,
 y_train_dl, y_test_dl) = train_test_split(
    X_tfidf, X_w2v, y, test_size=0.2, random_state=42
)

# Seed the Python, NumPy and TensorFlow RNGs before any layer is created so
# weight initialisation (and the shuffling done later by model.fit) repeats
# across kernel restarts. NOTE(review): some GPU kernels may still introduce
# run-to-run variation.
tf.keras.utils.set_random_seed(42)

# TF-IDF branch: one dense layer over the TF-IDF feature vector
input_tfidf = Input(shape=(X_train_tfidf.shape[1],), name='tfidf_input')
x1 = layers.Dense(128, activation='relu')(input_tfidf)

# Word2Vec branch: one dense layer over the averaged document embedding
input_w2v = Input(shape=(X_train_w2v.shape[1],), name='w2v_input')
x2 = layers.Dense(128, activation='relu')(input_w2v)

# merge: concatenate both 128-unit representations, then classify
merged = layers.concatenate([x1, x2])
dense = layers.Dense(64, activation='relu')(merged)
# single sigmoid unit -> probability for the binary target
output = layers.Dense(1, activation='sigmoid')(dense)

model = Model(inputs=[input_tfidf, input_w2v], outputs=output)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

In [25]:
# The Dense input layer needs dense arrays. Cast the sparse TF-IDF matrix to
# float32 BEFORE densifying: the training matrix is roughly 87k x 5000, which
# is about 3.5 GB as float64 but half that as float32.
# NOTE(review): this assumes Keras is running with its default float32 dtype,
# in which case the extra float64 precision would be discarded anyway.
history = model.fit(
    [X_train_tfidf.astype(np.float32).toarray(), X_train_w2v],
    y_train_dl,
    validation_split=0.2,
    epochs=3,
    batch_size=64,
)

# evaluate() returns [loss, accuracy], matching the metrics passed to compile().
results = model.evaluate(
    [X_test_tfidf.astype(np.float32).toarray(), X_test_w2v],
    y_test_dl,
)
print("Test Loss, Test Accuracy:", results)

Epoch 1/3
[1m1093/1093[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 16ms/step - accuracy: 0.8469 - loss: 0.4051 - val_accuracy: 0.8557 - val_loss: 0.3751
Epoch 2/3
[1m1093/1093[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 16ms/step - accuracy: 0.8565 - loss: 0.3571 - val_accuracy: 0.8553 - val_loss: 0.3754
Epoch 3/3
[1m1093/1093[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 15ms/step - accuracy: 0.8717 - loss: 0.3259 - val_accuracy: 0.8479 - val_loss: 0.3984
[1m683/683[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.8444 - loss: 0.4039
Test Loss, Test Accuracy: [0.4061063230037689, 0.8421968221664429]


## 6. Model Comparison & Analysis

In [26]:
!pip install gensim



In [27]:
# Accuracy alone is misleading on this target: the classification report shows
# 18534 of the 21850 test rows in class 1, so always predicting the majority
# class already scores about 0.85. Print that baseline next to both models.
majority_baseline = y_test.value_counts(normalize=True).max()
print("Majority-class Baseline Accuracy:", majority_baseline)
print("Classical Logistic Regression Accuracy:", clf.score(X_test, y_test))
# results is [loss, accuracy] from model.evaluate()
print("Deep Learning Accuracy:", results[1])

Classical Logistic Regression Accuracy: 0.8505263157894737
Deep Learning Accuracy: 0.8421968221664429
