# <div align="center">CP322-A Mini-Project 2: Machine Learning</div>
## <div align="center">Group 6</div>
### <div align="center">due on 12-Nov-2023 at 11:30 PM</div>

Imports:

In [22]:
import numpy as np
import os
import gzip

from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

## Datasets

### Dataset 1 (20 Newsgroup): 

Use the default train subset (subset=‘train’, and remove=([‘headers’, ‘footers’, ‘quotes’]) in  sklearn.datasets) to train the models and report the final performance on the test subset.  note: you need to start with the text data and convert the text to feature vectors. Please refer to https://scikitlearn.org/stable/tutorial/text_analytics/working_with_text_data.html for a tutorial on the steps needed for this.

In [18]:
categories = ['alt.atheism', 'soc.religion.christian',
              'comp.graphics', 'sci.med']

twenty_train = fetch_20newsgroups(subset='train',
    categories=categories, shuffle=True, random_state=42)

d1_data = twenty_train.data
d1_labels = twenty_train.target

tfidf_vectorizer = TfidfVectorizer()
d1_tfidf = tfidf_vectorizer.fit_transform(d1_data)

count_vect = CountVectorizer()

d1_train_counts = count_vect.fit_transform(twenty_train.data)
d1_train_counts.shape

d1_tf_transformer = TfidfTransformer(use_idf=False).fit(d1_train_counts)
d1_train_tf = d1_tf_transformer.transform(d1_train_counts)
d1_train_tf.shape

(2257, 35788)

### Dataset 2 (IMDB Reviews):

In [19]:
def read_imdb():
    print("Begin reading files.")
    directory = 'data/aclImdb/train/neg'
    data = []
    labels = []
    file = ''
    for filename in os.listdir(directory):
        f = open(os.path.join(directory, filename), encoding="utf-8")
        file = filename, "neg"
        for i in f:
            test = True
            while test:
                test = False
                k = i.find('<br /><br />')
                if k != -1:
                    i = i[:k] + ' ' + i[k+12:]
                    test = True

            data.append(i)
            labels.append(0)

    print("Done reading negative values.")

    directory = 'data/aclImdb/train/pos'

    for filename in os.listdir(directory):
        #print(filename)
        f = open(os.path.join(directory, filename), encoding="utf-8")
        file = filename, "pos"
        for i in f:

            test = True
            while test:
                test = False
                k = i.find('<br /><br />')
                if k != -1:
                    i = i[:k] + ' ' + i[k+12:]
                    test = True

            data.append(i)
            labels.append(1)
        
    print("Done reading positive values.")
    npdata = np.array(data)
    nplabels = np.array(labels)
    
    return npdata, nplabels
  
d2_data, d2_labels = read_imdb()
print("File reading complete.")

Begin reading files.
Done reading negative values.
Done reading positive values.
File reading complete.


In [20]:
tfidf_vectorizer = TfidfVectorizer()
count_vect = CountVectorizer()
d2_train_counts = count_vect.fit_transform(d2_data)
d2_train_counts.shape
d2_tfidf = tfidf_vectorizer.fit_transform(d2_data)

d2_tf_transformer = TfidfTransformer(use_idf=False).fit(d2_train_counts)
d2_imdb_train_tf = d2_tf_transformer.transform(d2_train_counts)
d2_imdb_train_tf.shape

(25000, 74849)

## Models

### Logistic Regression

In [2]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import StratifiedKFold


# Fetch the 20 newsgroups dataset
# newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))

# Partiall dataset
# categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']


# Fetch the "train" subset of the data to be used
# newsgroups = fetch_20newsgroups(subset='train', categories=categories, remove=('headers', 'footers', 'quotes'))
newsgroups = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))

# Vectorizing the text from the given dataset using TF-IDF to get the matrix
data_vectorizer = TfidfVectorizer(max_features=10000)
X_tfidf = data_vectorizer.fit_transform(newsgroups["data"])

# Defining the k-fold cross-validation with 5 folds measures to obtain the 5 groups
kFold = StratifiedKFold(n_splits=5,shuffle=True, random_state=42)

# Defining logistic regression model with 1000 interations
logistic_regression = LogisticRegression(max_iter=1000)

print(newsgroups["target_names"])
print(len(newsgroups["target"]))
print(len(newsgroups["data"]))


['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
11314
11314


### Decision Trees

#### Dataset 1: Newsgroups

In [67]:
#train and test data for newsgroups dataset
X_train, X_test, y_train, y_test = train_test_split(d1_tfidf, d1_labels, test_size=0.2, random_state=42)

decision_tree_classifier = DecisionTreeClassifier(criterion="gini", splitter="best", random_state=42, min_samples_split=10)

decision_tree_classifier.fit(X_train, y_train)

predicted_y = decision_tree_classifier.predict(X_test)

dt_accuracy = accuracy_score(y_test, predicted_y)
dt_precision = precision_score(y_test, predicted_y, average="weighted")
dt_recall = recall_score(y_test, predicted_y, average="weighted")
dt_f1_score = f1_score(y_test, predicted_y, average="weighted")

print(f"Accuracy of Decision Tree: {dt_accuracy:.4f}")
print(f"Precision of Decision Tree: {dt_precision:.4f}")
print(f"Recall of Decision Tree: {dt_recall:.4f}")
print(f"F1 Score of Decision Tree: {dt_f1_score:.4f}")


Accuracy of Decision Tree: 0.7832
Precision of Decision Tree: 0.7843
Recall of Decision Tree: 0.7832
F1 Score of Decision Tree: 0.7830


#### Dataset 2: IMDB

In [65]:
#train and test for imdb dataset
X_train, X_test, y_train, y_test = train_test_split(d2_tfidf, d2_labels, test_size=0.2, random_state=42)

decision_tree_classifier = DecisionTreeClassifier(criterion="gini", splitter="best", random_state=42, min_samples_leaf=10, max_leaf_nodes=110, min_samples_split=20)

decision_tree_classifier.fit(X_train, y_train)

predicted_y = decision_tree_classifier.predict(X_test)

dt_accuracy = accuracy_score(y_test, predicted_y)
dt_precision = precision_score(y_test, predicted_y, average="weighted")
dt_recall = recall_score(y_test, predicted_y, average="weighted")
dt_f1_score = f1_score(y_test, predicted_y, average="weighted")

print(f"Accuracy of Decision Tree: {dt_accuracy:.4f}")
print(f"Precision of Decision Tree: {dt_precision:.4f}")
print(f"Recall of Decision Tree: {dt_recall:.4f}")
print(f"F1 Score of Decision Tree: {dt_f1_score:.4f}")

Accuracy of Decision Tree: 0.7376
Precision of Decision Tree: 0.7383
Recall of Decision Tree: 0.7376
F1 Score of Decision Tree: 0.7375


### Support Vector Machines

In [None]:
from sklearn.svm import LinearSVC


svm_model = LinearSVC()

### Ada Boost

### Random Forest

## Validation

In [None]:
from sklearn.model_selection import cross_val_score,  KFold
import pandas as pd


model_results = {
    "Model": [],
    "Dataset": [],
    "Accuracy (Mean)": [] 
}

kf = KFold(n_splits=5)

svm_scores = cross_val_score(svm_model, X_tfidf, Y, cv=kf)
svm_accuracy_mean = svm_scores.mean()

model_results["Model"].append("Support Vector Machines")
model_results["Dataset"].append("20 Newsgroup")
model_results["Accuracy (Mean)"].append(svm_accuracy_mean)

# DataFrame to report the performance of SVM
results_df = pd.DataFrame(model_results)

print(results_df)