# In [None]:
# Part1
import os

import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report,confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

# Fetch the 20 newsgroups dataset
# newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))

# Partiall dataset
# categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']


# Fetch the "train" subset of the data to be used
# newsgroups = fetch_20newsgroups(subset='train', categories=categories, remove=('headers', 'footers', 'quotes'))
newsgroups = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))

# Vectorizing the text from the given dataset using TF-IDF to get the matrix
data_vectorizer = TfidfVectorizer(max_features=10000)
X_tfidf = data_vectorizer.fit_transform(newsgroups.data)

# Defining the k-fold cross-validation with 5 folds measures to obtain the 5 groups
kFold = StratifiedKFold(n_splits=5,shuffle=True, random_state=42)

# Defining logistic regression model with 1000 interations
logistic_regression = LogisticRegression(max_iter=1000)

# Defining place holder lists for the metrics for each of the resulting folds
accuracies_array = []
precisions_array = []
recalls_array = []
# Implementing k-fold cross-validation using the test and train data
for train_position, test_position in kFold.split(X_tfidf, newsgroups.target):
    X_trainData= X_tfidf[train_position]
    X_testData = X_tfidf[test_position]
    y_trainData = newsgroups.target[train_position]
    y_testData= newsgroups.target[test_position]

    
    #Using the training portion of data to train it for our Logistic regression model
    logistic_regression.fit(X_trainData, y_trainData)

    # Defining our predicts on the test data of the dataset
    y_predictData = logistic_regression.predict(X_testData)

    # Analysing the models using each of the k-folds
    accuracy = accuracy_score(y_testData, y_predictData)
    classification_report_dict = classification_report(y_testData, y_predictData, output_dict=True)
    
    
    # Adding the metrictics int the defined lists to be used for final average
    accuracies_array.append(accuracy)
    precisions_array.append(classification_report_dict['macro avg']['precision'])
    recalls_array.append(classification_report_dict['macro avg']['recall'])

    # Dispaying the results of the metrics of the 5 folds
    print(f'Accuracy: {accuracy:.2f}')
    print(classification_report(y_testData, y_predictData))

# Aggregate and displays the average rsults of the metrics using all the k-folds.
avg_accuracy = sum(accuracies_array) / len(accuracies_array)
avg_precision = sum(precisions_array) / len(precisions_array)
avg_recall = sum(recalls_array) / len(recalls_array)

# Final display of the results
print("\nAverage Metrics Across the 5 K-Folds:")
print(f'Average Accuracy of the dataset is: {avg_accuracy:.2f}')
print(f'Average Precision of the dataset is: {avg_precision:.2f}')
print(f'Average Recall of the dataset is: {avg_recall:.2f}')





# Part2


def read_train_imdb():
    """Load the IMDB training reviews from ``data/train/neg`` and ``data/train/pos``.

    Every line of every file in the two directories becomes one sample (the
    line's trailing newline, if any, is kept). Lines from the ``neg``
    directory are labelled 0 and lines from the ``pos`` directory 1. Each
    ``'<br /><br />'`` marker in a line is replaced by a single space.

    Files are visited in sorted filename order so the sample order is the
    same on every run and platform (``os.listdir`` order is arbitrary).

    Loading is best-effort: if a directory or file cannot be read or decoded,
    the error and the file being processed are printed and the samples
    collected so far are returned.

    Returns:
        tuple: ``(npdata, nplabels)`` - index-aligned NumPy arrays holding
        the review lines and their 0/1 labels.
    """
    break_marker = '<br /><br />'
    # Both classes live under data/train; the positive reviews were previously
    # read from 'data1/train/pos', which did not match any other path here.
    sources = (
        ('data/train/neg', 'neg', 0),
        ('data/train/pos', 'pos', 1),
    )
    data = []
    labels = []
    current_file = ''
    try:
        for directory, sentiment, label in sources:
            for filename in sorted(os.listdir(directory)):
                # Recorded before opening so a failure names the right file.
                current_file = filename, sentiment
                with open(os.path.join(directory, filename), encoding="utf-8") as f:
                    for line in f:
                        # Replace leftmost-first until no marker remains; a
                        # replacement can itself complete a new marker.
                        while break_marker in line:
                            line = line.replace(break_marker, ' ', 1)
                        data.append(line)
                        labels.append(label)
    except (OSError, UnicodeError) as e:
        print(e)
        print("Error found in file:", current_file)

    npdata = np.array(data)
    nplabels = np.array(labels)

    return npdata, nplabels
  
# Load the IMDB training split; the text and label arrays are index-aligned.
train_data, train_labels = read_train_imdb()

def read_test_imdb():
    """Load the IMDB test reviews from ``data/test/neg`` and ``data/test/pos``.

    Every line of every file in the two directories becomes one sample (the
    line's trailing newline, if any, is kept). Lines from the ``neg``
    directory are labelled 0 and lines from the ``pos`` directory 1. Each
    ``'<br /><br />'`` marker in a line is replaced by a single space.

    Files are visited in sorted filename order so the sample order is the
    same on every run and platform (``os.listdir`` order is arbitrary).

    Loading is best-effort: if a directory or file cannot be read or decoded,
    the error and the file being processed are printed and the samples
    collected so far are returned.

    Returns:
        tuple: ``(npdata, nplabels)`` - index-aligned NumPy arrays holding
        the review lines and their 0/1 labels.
    """
    break_marker = '<br /><br />'
    sources = (
        ('data/test/neg', 'neg', 0),
        ('data/test/pos', 'pos', 1),
    )
    data = []
    labels = []
    current_file = ''
    try:
        for directory, sentiment, label in sources:
            for filename in sorted(os.listdir(directory)):
                # Recorded before opening so a failure names the right file.
                current_file = filename, sentiment
                with open(os.path.join(directory, filename), encoding="utf-8") as f:
                    for line in f:
                        # Replace leftmost-first until no marker remains; a
                        # replacement can itself complete a new marker.
                        while break_marker in line:
                            line = line.replace(break_marker, ' ', 1)
                        data.append(line)
                        labels.append(label)
    except (OSError, UnicodeError) as e:
        print(e)
        print("Error found in file:", current_file)

    npdata = np.array(data)
    nplabels = np.array(labels)

    return npdata, nplabels
  
# Load the IMDB test split (not used by the cross-validation below).
test_data, test_labels = read_test_imdb()

# TF-IDF matrix of the complete training set. It is kept under its original
# name and tells the splitter how many samples exist, but it is NOT used to
# train or score the folds: a vectorizer fitted on every review would leak the
# held-out reviews' vocabulary and IDF statistics into training.
data_vectorizer = TfidfVectorizer(analyzer = 'word',max_features=10000)
X_tfidf = data_vectorizer.fit_transform(train_data)

# Stratified 5-fold cross-validation; the fixed seed makes the folds reproducible.
kFold = StratifiedKFold(n_splits=5,shuffle=True, random_state=42)

# Logistic regression model, allowed up to 1000 solver iterations.
logistic_regression = LogisticRegression(max_iter=1000)

# Per-fold metrics, averaged once all folds have been evaluated.
accuracies_array = []
precisions_array = []
recalls_array = []

for train_position, test_position in kFold.split(X_tfidf, train_labels):
    y_trainData = train_labels[train_position]
    y_testData = train_labels[test_position]

    # Fit the TF-IDF vocabulary and weights on this fold's training reviews
    # only, then apply that fitted transform to the held-out reviews.
    fold_vectorizer = TfidfVectorizer(analyzer='word', max_features=10000)
    X_trainData = fold_vectorizer.fit_transform(train_data[train_position])
    X_testData = fold_vectorizer.transform(train_data[test_position])

    # Train on the training portion of the fold.
    logistic_regression.fit(X_trainData, y_trainData)

    # Predict the held-out portion of the fold.
    y_predictData = logistic_regression.predict(X_testData)

    # Score this fold; the dict form of the report gives the macro averages.
    accuracy = accuracy_score(y_testData, y_predictData)
    classification_report_dict = classification_report(y_testData, y_predictData, output_dict=True)

    # Record the metrics for the final average.
    accuracies_array.append(accuracy)
    precisions_array.append(classification_report_dict['macro avg']['precision'])
    recalls_array.append(classification_report_dict['macro avg']['recall'])

    # Display this fold's results.
    print(f'Accuracy: {accuracy:.2f}')
    print(classification_report(y_testData, y_predictData))

# Average each metric over all folds.
avg_accuracy = float(np.mean(accuracies_array))
avg_precision = float(np.mean(precisions_array))
avg_recall = float(np.mean(recalls_array))

# Final display of the results
print("\nAverage Metrics Across the 5 K-Folds:")
print(f'Average Accuracy of the dataset is: {avg_accuracy:.2f}')
print(f'Average Precision of the dataset is: {avg_precision:.2f}')
print(f'Average Recall of the dataset is: {avg_recall:.2f}')




