In [None]:
## Task 1

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns

#TO Encode, Scale and split data
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

#The Models we are going to use
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

#To make a print_score function
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score

In [None]:
# Load the raw mushroom data from a CSV file in the working directory.
# NOTE(review): read_csv treats the first row as a header by default; if the
# file has no header row, the first sample is consumed as column names — TODO confirm.
dataset =  pd.read_csv("agaricus-lepiota.csv")

In [None]:
# Rename the columns to descriptive names so the data is easier to evaluate.
# The list must contain exactly one name per column of the loaded frame.
dataset.columns = ['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor', 'gill-attachment',
                   'gill-spacing', 'gill-size', 'gill-color', 'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
                   'stalk-surface-below-ring', 'stalk-color-above-ring', 'stalk-color-below-ring', 'veil-type',
                   'veil-color', 'ring-number', 'ring-type', 'spore-print-color', 'population', 'habitat']


In [None]:
#Label Encoding, so that the cells contain corresponding number for the character, and replaces it.
Encoder = LabelEncoder()
for col in dataset.columns:
    dataset[col] = Encoder.fit_transform(dataset[col])

In [None]:
# Preview the first rows of the frame as the cell output.
dataset.head()
#dataset.info() #Shows that every row has non-null values

In [None]:
# Want to get a quick overview of how many mushrooms are poisonous or not. We can see that there are roughly the same
# amount of poisonous and edible
#sns.countplot(x='class', data=dataset)

Then we want to generate a heatmap to visualize the correlation between the attributes.

We want to use the heatmap to identify which features are strongly correlated with the target variable ('class') and with each other. This can help us determine which features we should drop in order to create a more accurate model.

In [None]:
#sns.heatmap(dataset.corr())

As the heatmap above shows, white cells indicate a high correlation. If an attribute is strongly correlated with the target variable 'class', it makes the prediction almost trivial, so we drop it in order to really test the models. We can also see that some attributes are highly correlated with each other; in that case we choose to drop one of them to avoid multicollinearity.

In [None]:
# Some columns have to go because of the logical rules in the dataset:
# these are the most indicative features, so they are removed before the
# models are run.
drop_features = [
    'odor', 'spore-print-color', 'habitat', 'stalk-shape', 'gill-size',
    'gill-spacing', 'bruises', 'gill-color', 'stalk-root', 'ring-type',
    'stalk-surface-below-ring', 'stalk-surface-above-ring', 'population',
    'cap-color',
]
dataset = dataset.drop(columns=drop_features)

In [None]:
# Response: the 'class' column, which is what we are trying to predict.
y = dataset['class']
# Predictors: every remaining column, i.e. the data we have to work with.
X = dataset.drop(columns='class')

In [None]:
#scalar = StandardScaler()
#X = pd.DataFrame(scalar.fit_transform(X), columns = X.columns)

In [None]:
# Split the dataset into a training set (70 %) and a test set (30 %).
# random_state is fixed so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=38)

We want to use KNN, DecisionTreeClassifier and RandomForestClassifier.
The reason for each is:

We want to use KNN because it is a "lazy" algorithm that is easy to implement and interpret. KNN is very efficient and accurate for small datasets, and we therefore think it is a good fit for our dataset. We are going to use grid search to find the optimal number of neighbours.

Random Forest is a useful algorithm for the mushroom dataset because it can handle large amounts of data and noisy or missing data. Since the mushroom dataset contains many different attributes, Random Forest can be used to identify which attributes are most important for classification by creating multiple decision trees and combining their results.

Decision Tree is well-suited for the mushroom dataset since it can handle non-linear classification problems and can capture the complex interactions between the features. Additionally, decision trees are easy to interpret, which can be useful for understanding which features are most important for classification.

Overall, the combination of these three algorithms provides a good balance between accuracy, interpretability, and robustness for the classification task.

In [None]:
# Grid search (5-fold cross-validation on the training set) to find the best
# number of neighbours for KNN.
knn_gs = KNN()
param_grid = {'n_neighbors': range(1, 31)}  # Candidate neighbour counts 1..30
grid_search = GridSearchCV(knn_gs, param_grid, cv=5)
grid_search.fit(X_train, y_train)
best_param = grid_search.best_params_['n_neighbors']  # best parameter found

# KNN: GridSearchCV refits the best configuration on the whole training set
# by default (refit=True), so that fitted estimator is reused instead of
# training an identical model a second time.
knn = grid_search.best_estimator_

# DecisionTreeClassifier. random_state is fixed so a re-run builds the same
# tree and the reported scores are reproducible.
dtc = DecisionTreeClassifier(max_depth=5, random_state=38)
dtc.fit(X_train, y_train)

# RandomForestClassifier. The forest is built from random bootstrap samples
# and feature subsets; without a fixed random_state every run gives
# slightly different scores.
rfc = RandomForestClassifier(random_state=38)
rfc.fit(X_train, y_train)

In [None]:
#A function to print a classification report for the different models.
def print_score(classifier,X_train,y_train,X_test,y_test, name):
    """Print classification reports for an already fitted classifier.

    Both the training set and the held-out test set are reported: the
    training report alone says nothing about how the model generalises.

    Args:
        classifier: fitted estimator exposing ``predict``.
        X_train: training predictors.
        y_train: training labels.
        X_test: test predictors.
        y_test: test labels.
        name: display name used in the printed headings.
    """
    print("Training results for", name, ":\n")
    print('Classification Report:\n{}\n'.format(classification_report(y_train,classifier.predict(X_train))))

    # Scores on data the model has never seen are the ones to judge it by.
    print("Test results for", name, ":\n")
    print('Classification Report:\n{}\n'.format(classification_report(y_test,classifier.predict(X_test))))

# Report every fitted model in turn, in the order KNN, DTC, RFC.
for fitted_clf, clf_name in ((knn, "KNN"), (dtc, "DTC"), (rfc, "RFC")):
    print_score(fitted_clf, X_train, y_train, X_test, y_test, clf_name)

c) Our best performing model is the Random Forest classifier, with an average score of 84%. We would still not trust the model, as roughly 16% of the predictions are still wrong, and that can be dangerous. The original dataset was sufficient to give a score of 100% for all three models. To really test the models we would need to decrease the number of columns and avoid the logical rules of the dataset.

The decision tree classifier is a tree-like model that splits the data based on the most significant feature until a prediction can be made. We think the Random Forest classifier performs best because it is an ensemble of decision trees, where each tree makes a prediction and the final prediction is made by combining the predictions of all trees. Therefore, the RFC is even more precise than the DTC.

## Task 2

In this first part of the task I will evaluate my preprocessing, before starting with the actual sentiment analysis.

In [None]:
import json
import nltk

# Extracts only the text part of the document.
def data_processing(file_name) -> list:
    """Extract the "text" field of every record in a JSON file.

    The file is expected to contain a JSON array of dictionaries, each of
    which has a "text" key.

    Args:
        file_name: path of the JSON file to read.

    Returns:
        A list with the "text" value of each record, in file order.

    Raises:
        KeyError: if a record has no "text" key.
    """
    # JSON text is UTF-8; being explicit avoids depending on the platform's
    # default encoding when the messages contain non-ASCII characters.
    with open(file_name, "r", encoding="utf-8") as training_data:
        train_array = json.load(training_data)

    return [record["text"] for record in train_array]

I asked my group leader, who gave me good advice on tokenizing the data and removing the stopwords.

He advised me to sanitize the data by removing the stopwords while tokenizing, which is a smart way of avoiding an unnecessarily high number of list accesses. The function sentance_sanitized_tokening() takes the list of full sentences and iterates through each tokenized word in a sentence, checking whether it exists in dummyWords, which is an English stopword list from nltk. If the word exists in the list, the function just moves on to the next word. If it does not, the word is added to the list of words that is appended to the output list.

Sentence tokenizing and stopword removal function:

In [None]:
def sentance_sanitized_tokening(data: list) -> list:
    """Tokenize each sentence and drop English stopwords in the same pass.

    Args:
        data: iterable of two-element items ``(sentence, label)``.

    Returns:
        A list with one ``[filtered_tokens, label]`` entry per input item, in
        input order. ``filtered_tokens`` holds the tokens of the sentence that
        are not stopwords, with their original casing.
    """
    tokenized_list = []
    # A set gives constant-time membership tests; the plain list returned by
    # NLTK would be scanned from the start for every single token.
    dummyWords = set(nltk.corpus.stopwords.words("english"))

    # Each item is ["text", "label"].
    for sentance, label in data:
        # Each sentence gets its own list with the words that are not stopwords.
        filtered_tokens = []

        for word in nltk.word_tokenize(sentance):
            assert type(word)==str, "Words inside the dictionary passed to tokenize(data) is not of type string!!"

            # The NLTK stopword entries are lowercase, so the token is
            # lowercased for the lookup only; otherwise capitalised stopwords
            # such as "The" at the start of a sentence would be kept.
            if word.lower() not in dummyWords:
                filtered_tokens.append(word)

        # All non-stopwords of this sentence are collected; add them to the output.
        tokenized_list.append([filtered_tokens, label])

    return tokenized_list

One thing I considered was whether the symbol "!" should be removed. It may help to characterize negative words.

In [None]:
## Task 3

In [None]:
from matplotlib import pyplot as plt
import numpy as np
import torch

import torchvision
from torchvision import datasets, transforms

from collections import Counter
from torch.utils.data import random_split

In [None]:
# Fixed seed: seeds torch's global RNG here and is reused for the
# train/validation split generator, so runs are reproducible.
seed = 10
torch.manual_seed(seed)

# Class index of the category treated as the positive class.
category_index_manual = 8
# Number of training images held out for validation.
n_val = 5000

# Root directory handed to the CIFAR-10 loader.
# NOTE(review): this is an absolute path at the filesystem root — confirm it
# matches where the dataset actually lives on the machine running the notebook.
data_path_manual = '/cifar-10-batches-py'

Function for loading the Cifar10 dataset.

The method will have to be run twice. The first time we load it as tensors and split the set between training, validation, and testing. We only want to identify one type of image (ships), so we get the labels of all the images and set all to "false" if they are not ships.

After running the method for the first time, we create a normalizer from the std and mean of the images. The method is then run a second time with the normalizer as the preprocessor.

def load_dataset(n_val, category_index, data_path, preprocessor):
    """Load CIFAR-10 from disk and split its training part into train/validation.

    Args:
        n_val: number of training images held out for validation.
        category_index: class index of interest. It is not used inside this
            function; the parameter is kept so existing calls keep working.
        data_path: root directory passed to ``datasets.CIFAR10``.
        preprocessor: transform applied to every image when it is accessed
            (for example ``transforms.ToTensor()`` or a normalizer).

    Returns:
        A tuple ``(train_val, train_split, val_split, test)``: the full
        training dataset, its random train and validation subsets, and the
        test dataset.
    """
    transformed_cifar10_train_val = datasets.CIFAR10(
        data_path,
        train=True,
        download=False,
        transform=preprocessor,
    )

    transformed_cifar10_test = datasets.CIFAR10(
        data_path,
        train=False,
        download=False,
        transform=preprocessor,
    )

    n_train = len(transformed_cifar10_train_val) - n_val

    # A dedicated generator seeded with the notebook-level `seed` makes the
    # split identical on every call, whatever the preprocessor is.
    transformed_cifar10_train_split, transformed_cifar10_val_split = random_split(
        transformed_cifar10_train_val,
        [n_train, n_val],
        generator=torch.Generator().manual_seed(seed),
    )

    print("Size of the train dataset:        ", len(transformed_cifar10_train_split))
    print("Size of the validation dataset:   ", len(transformed_cifar10_val_split))
    print("Size of the test dataset:         ", len(transformed_cifar10_test))

    # Hand the datasets back to the caller; without this they would be lost
    # when the function returns.
    return (
        transformed_cifar10_train_val,
        transformed_cifar10_train_split,
        transformed_cifar10_val_split,
        transformed_cifar10_test,
    )

# Keep the datasets at notebook level so the following cells can use them.
(
    transformed_cifar10_train_val,
    transformed_cifar10_train_split,
    transformed_cifar10_val_split,
    transformed_cifar10_test,
) = load_dataset(n_val, category_index_manual, data_path_manual, preprocessor=transforms.ToTensor())

In [None]:
# Binary labels: 1 where the image belongs to the chosen category, else 0.
# The class indices are read from the dataset's `targets` list; converting the
# dataset object itself to an array would not give the labels.
train_labels = (np.array(transformed_cifar10_train_val.targets) == category_index_manual).astype(int)

test_labels = (np.array(transformed_cifar10_test.targets) == category_index_manual).astype(int)


In [None]:
# Stack every image of the training data into one tensor (new leading axis).
imgs = torch.stack([sample[0] for sample in transformed_cifar10_train_val])

# Reduce over axes 0, 2 and 3 so that one mean and one std remain per entry
# of axis 1; these statistics parameterise the normalizer.
per_channel_dims = (0, 2, 3)
normalizer = transforms.Normalize(
    mean=imgs.mean(dim=per_channel_dims),
    std=imgs.std(dim=per_channel_dims),
)

In [None]:
from torchvision import models
import torch.nn as nn
import torch.optim as optim

In [None]:
# Loading pre-trained ResNet18 model
model = models.resnet18(pretrained=True)

# Modify the last layer for binary classification: the fully connected head
# is replaced by a layer with a single output (one logit).
# TODO: add markdown explaining that we solve this as binary classification
num_features = model.fc.in_features
model.fc = nn.Linear(num_features,1)

# Binary Cross Entropy with Logits Loss (BCEWithLogitsLoss) is used as the
# loss function; it takes the raw logit of the single output.
loss_function = nn.BCEWithLogitsLoss()

# TODO: add a question about whether we should use SGD, Adam, RMSprop or something else
# TODO: add a question about whether we should change the learning rate, or at
# least explain why we use the one we use
# The optimizer class is `optim.Adam` (capital A); `optim.adam` is not an
# optimizer class and fails when called.
optimizer = optim.Adam(model.parameters(),  lr=0.001)