***Question 1***

In [30]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD  # for LSA
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report

In [31]:
ads = pd.read_csv("https://raw.githubusercontent.com/rbaid-9/Schulich_AI/main/farm-ads.csv" , header=None, names=['ad'])

# Preprocessing
stop_words = set(stopwords.words('english')) 
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(ads['ad'])

In [32]:
term_doc_matrix = X.toarray()
print("Term-Document Matrix:")

term_doc_df = pd.DataFrame(term_doc_matrix)
term_doc_df.head()

Term-Document Matrix:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,47397,47398,47399,47400,47401,47402,47403,47404,47405,47406
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
# Dimensionality Reduction (LSA)
lsa = TruncatedSVD(n_components=20)
concept_doc_matrix = lsa.fit_transform(X)

print("\nConcept-Document Matrix:")
concept_doc_df = pd.DataFrame(concept_doc_matrix, columns=[f'Concept {i+1}' for i in range(concept_doc_matrix.shape[1])])


Concept-Document Matrix:


In [34]:
concept_doc_df

Unnamed: 0,Concept 1,Concept 2,Concept 3,Concept 4,Concept 5,Concept 6,Concept 7,Concept 8,Concept 9,Concept 10,Concept 11,Concept 12,Concept 13,Concept 14,Concept 15,Concept 16,Concept 17,Concept 18,Concept 19,Concept 20
0,4.820819,3.244891,-4.323800,3.926149,-5.022188,-0.426192,6.521385,1.884530,-1.386161,3.244380,1.011257,-1.849159,-0.944104,-0.220115,-0.715066,-0.577989,-0.581281,0.523397,-0.213442,-1.453552
1,3.469492,3.135591,-3.711481,2.079121,-3.353587,-0.154063,4.469310,0.926810,-0.852358,1.891415,0.549301,-1.022083,-0.256895,-0.299846,-0.346762,-0.142092,-0.638800,-0.234484,-0.001062,-0.291519
2,4.882747,3.815947,-4.311637,2.388987,-3.845211,-0.171353,4.858159,0.754584,-0.859031,2.025074,0.533195,-1.132673,-0.139678,-0.417009,-0.402852,-0.126767,-0.830100,-0.557250,-0.447872,0.184718
3,4.252620,5.155948,-5.424388,3.175506,-1.439190,-0.591319,4.585351,1.249264,-1.025862,1.598834,0.266992,0.283258,-0.010734,-0.662123,-0.288448,-0.042577,-1.058323,-0.241746,0.182305,0.364397
4,5.173128,5.734124,-6.220281,5.102541,0.402257,-1.200593,4.882575,1.785923,-1.199996,1.805845,0.424123,-1.053196,-0.533517,-0.636799,-0.411203,0.133484,-1.223155,-0.276176,0.235761,-0.188938
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4138,46.885382,225.379950,229.229110,19.719068,-1.920906,-8.999835,2.996875,8.365889,1.549482,11.804376,-1.047569,-3.789121,0.141339,2.139152,7.800078,-13.500942,-16.477981,2.283059,-2.845083,-13.179244
4139,47.471083,227.162272,230.507360,19.812460,-2.561689,-9.641252,3.465807,8.406720,1.224578,11.856617,-0.462622,-4.125791,0.309658,2.115249,6.557796,-10.812166,-15.883576,2.199999,-2.718894,-14.507097
4140,47.132553,226.986381,230.177544,19.883903,-2.736677,-10.374459,3.456389,8.388705,1.316279,11.838308,-0.494691,-4.054373,0.355585,2.163203,6.527701,-10.692285,-15.981133,2.164059,-2.621383,-14.374633
4141,48.197316,226.009566,231.037194,20.342717,-1.896809,-7.499180,3.207501,7.901004,1.124833,13.348240,-0.427080,-4.150350,0.371051,2.074014,7.053100,-10.936975,-16.264734,2.924193,-1.261143,-16.214097


***Question 2***

In [35]:
# Get the terms from the CountVectorizer
terms = vectorizer.get_feature_names_out()

term_doc_df = pd.DataFrame(term_doc_matrix, columns=terms)

# Find non-zero entries
non_zero_entries = term_doc_df[term_doc_df != 0].stack().reset_index()
non_zero_entries.columns = ['Document Index', 'Term Index', 'Frequency']

In [36]:
non_zero_entries.head()

Unnamed: 0,Document Index,Term Index,Frequency
0,0,aaa,1.0
1,0,abdominal,1.0
2,0,ad,13.0
3,0,aneurysm,1.0
4,0,aortic,1.0


***Question 3***

In [37]:
# Check for missing values
if ads.isnull().values.any():
    print("Warning: Missing values found in the data!")

# Extract labels
y = ads.index.values

# Extract features and convert them to numerical vectors
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(ads['ad'])

# Split the data into training and validation sets (75% training, 25% validation)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.25, random_state=42)

# Train logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict on validation set
y_pred = model.predict(X_valid)

# Evaluate the model
accuracy = accuracy_score(y_valid, y_pred)
print("Validation Accuracy:", accuracy)

# Classification report
print("Classification Report:")
print(classification_report(y_valid, y_pred))


Validation Accuracy: 0.915057915057915
Classification Report:
              precision    recall  f1-score   support

          -1       0.94      0.88      0.91       504
           1       0.89      0.95      0.92       532

    accuracy                           0.92      1036
   macro avg       0.92      0.91      0.91      1036
weighted avg       0.92      0.92      0.91      1036

