In [None]:
## Task 1

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns

#TO Encode, Scale and split data
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

#The Models we are going to use
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

#To make a print_score function
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score

In [None]:
# Load the data and preprocess it
dataset =  pd.read_csv("agaricus-lepiota.csv")

In [None]:
#Want to rename the colums, so its easier to evaluate
dataset.columns = ['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor', 'gill-attachment',
                   'gill-spacing', 'gill-size', 'gill-color', 'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
                   'stalk-surface-below-ring', 'stalk-color-above-ring', 'stalk-color-below-ring', 'veil-type',
                   'veil-color', 'ring-number', 'ring-type', 'spore-print-color', 'population', 'habitat']


In [None]:
#Label Encoding, so that the cells contain corresponding number for the character, and replaces it.
Encoder = LabelEncoder()
for col in dataset.columns:
    dataset[col] = Encoder.fit_transform(dataset[col])

In [None]:
dataset.head()
#dataset.info() #Shows that every row has non-null values

In [None]:
# Want to get a quick overview of how many mushrooms are poisonous or not. We can see that there are roughly the same
# amount of poisonous and edible
#sns.countplot(x='class', data=dataset)

Then we want to generate a heatmap to visualize the correlation between the attributes.

We want to use the heatmap to identify which features are strongly correlated with the target variable ('class') and with each other. This can help us determine which features we should drop in order to create a more accurate model.

In [None]:
#sns.heatmap(dataset.corr())

As the heatmap show above, white cells shows a high correlation, and if the attribute is strongly correlated with the target variable 'class' it is not a useful feature for the prediction and should be dropped. Also, we can see that some attributes are highly correleted with eachother, we would need to choose to drop one of them to avoid multicollinearity.

In [None]:
#Need to drop some coloms because of the logical rules in the dataset.
#This is features that are the most indicative, and we would therefore drop these before running the models.
drop_features = ['odor', 'spore-print-color', 'habitat', 'stalk-shape','gill-size','gill-spacing','bruises',
                 'gill-color','stalk-root','ring-type','stalk-surface-below-ring','stalk-surface-above-ring',
                 'population', 'cap-color']
dataset = dataset.drop(drop_features, axis=1)

In [None]:
X=dataset.drop('class',axis=1) #Predictors 'p'="class", This is what we are trying to predict.
y=dataset['class'] #Response, the data we have to work with, is then the rest of the dataset

In [None]:
#scalar = StandardScaler()
#X = pd.DataFrame(scalar.fit_transform(X), columns = X.columns)

In [None]:
#Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=38)

We want to use KNN, DecisionTreeclassifier and RandomForrestClassifier.
The reason for each is:

We want to use KNN because its a "lazy" algorithm that is easy to implement and interpret. KNN is very efficient and accurate for small datasets, and we think therefore this is a good fit for our dataset. We are going to use gridsearch to get the most optimal number of neighbours.

Random Forrest is a useful algorithm for the mushroom dataset because it can handle large amounts of data and noisy or missing data. Since the mushroom dataset contains many different attributes the Random Forest can be used to identify which attributes are most important for classification by creating multiple decision trees and combining their results.

Decision Tree is well-suited for the mushroom dataset since it can handle non-linear classification problems and can capture the complex interactions between the features. Additionally, decision trees are easy to interpret, which can be useful for understanding which features are most important for classification.

Overall, the combination of these three algorithms provides a good balance between accuracy, interpretability, and robustness for the classification task.

In [None]:
#Grid search to find best parameter
knn_gs = KNN()
param_grid = {'n_neighbors': range(1, 31)}  # Define the range of neighbors to test
grid_search = GridSearchCV(knn_gs, param_grid, cv=5)
grid_search.fit(X_train,y_train)
best_param = grid_search.best_params_['n_neighbors']# get the best parameter

#KNN
knn = KNN(n_neighbors=best_param)
knn.fit(X_train,y_train)

#DecisionTreeClassifier
dtc = DecisionTreeClassifier(max_depth=5)
dtc.fit(X_train,y_train)

#RandomForrestClassifier
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)

In [None]:
#A function to print a classification report for the different models.
def print_score(classifier,X_train,y_train,X_test,y_test, name):
    print("Training results for", name, ":\n")
    print('Classification Report:\n{}\n'.format(classification_report(y_train,classifier.predict(X_train))))

print_score(knn,X_train,y_train,X_test,y_test,"KNN")
print_score(dtc,X_train,y_train,X_test,y_test,"DTC")
print_score(rfc,X_train,y_train,X_test,y_test,"RFC")

c) Our best performing model is the RandomForrest classifier, as the average score is 84%. We would still not trust the model as there is still 20% that is wrong, and that can be dangerous. The dataset was originally sufficient because it would give a score of 100% for all three models. To really test the models we would need to decrease the numbers of colums and avoid the logical rules of the dataset.

The decision tree classifier is a tree-like model that splits the data based on the most significant feature until a prediction can be made. We think the random forrest classifier perform best because it is an ensemble of decision trees, where each tree makes a prediction and the final prediction is made by combining the predictions of all trees. Therefore, the RFC is even more precise than the dtc.

## Task 2

This first part of the task I will be evaluating my preprocessing. Before starting with the real Sentiment Analysis.

In [1]:
import json
import nltk
import random
import spacy

# This is the Norwegian language package with all extentions to remove redundant
# filler words from the data:
from spacy.lang.nb import Norwegian
from spacy.lang.nb.stop_words import STOP_WORDS

data_processing extracts the data from the json file provided in a structured way: ["sentance", "sentiment"]

In [2]:
def data_processing(file_name) -> list:
    message_data = []
    with open(f'3class/{file_name}', "r") as training_data:
        train_array = json.load(training_data)

        #Train_array is a array with dictionaries, each having three elements each.
        for category in train_array:
            #category is the index of the list with the three elements: sent_id, text, label
            message_data.append([category["text"], category["label"]]) # This appends [text, label] to the array.

        training_data.close()

    return message_data

In [3]:
train = data_processing("train.json")
# test = data_processing("test.json")

# print("Training data: \n", train, "\n", "Testing data: \n", test, "\n")

I heard from my seminar leader that there is a python extention named "spaCy" which is very helpful when it comes to preprocessing the data. And so I found the .lemma_  attribute of spaCy which could imporve the performance of the algorithms even more by lemmatizing the words in the data, in order to make them more understandable for processing. So I ran the train and test data through a spaCy lemmatizer and it was very effective.

Later when working with this project I struggled with removing the non-alphabetic characters from the dataset and more so to do that in an efficient way. Without needing to access the data unnecessary many times. And once again my seminar leader gave me a good advice to performe both lemmatizing and stopwords removal in one function. And so I did.

In [4]:
def the_lemmaSanitizer(data):
    lemmatized_data = []
    lemmatizer = spacy.load("nb_core_news_sm") # Norwegian lemmatization model.

    for chunk in data: # chunk is a index in data that both contains the sentance and it's sentiment. Not edited.

        that_text_part_of_the_chunk, that_sentiment_part_of_the_chunk = chunk[0], chunk[1]

        # pre_lemmed is a sentance that is put in the spacy format, I do this in order
        # to later be able to use nlp's attribute to check for non-alphabetic characters.
        pre_lemmed = lemmatizer(that_text_part_of_the_chunk)

        lemmed = []
        for root_word in pre_lemmed:
            # I will comment on this if statement later.
            if (not root_word.is_punct
                    and not root_word.is_currency
                    and not root_word.is_digit
                    and not root_word.is_space
                    and not root_word.is_stop
                    and not root_word.like_num):
                lemmed.append(root_word.lemma_) # .lemma_ finds the lemma of "root_word"

        lemmatized_data.append([lemmed, that_sentiment_part_of_the_chunk])

    return lemmatized_data

I was considering wether using these if statements in the for loop was a good idea or not. It was either this or using (string).is_alpha function, but the way I choose to implement this also removes stopwords and in general is more flexible to potentially edit later on. The fact that it removes stopwords was very helpfull in effective preprocessing of the data.

Previously I had a function that did the tokanization first, then removed the stopwords. And my original idea was to first performe these to tasks and then lemmatize the data but I managed to lemmatize, tokenize and remove the stopwords from the data at the same time.

Performing Tokanization was shown to be a very simple task since tokenizing data is basicly to separate the sentance into simple words, which preferably should be lemmatize. But the tokanization must happen in order to be able to lemmatize the words so combining those two tasks in one function was a cleaver thing to do.

Thought: Can I be overfitting the data with filtering out all the stopwords?

In [5]:
train = the_lemmaSanitizer(train)
#test = the_lemmaSanitizer(test)

print("Training data: ", train)
#print("Testing data: ", test)

Training data:  [[['philips', '190G6'], 'Neutral'], [['integrert', 'høyttal', 'måte', 'diskre', 'plassere', 'subwoof', 'inkludere', 'snakke', 'gutteskjerm'], 'Neutral'], [['bedra', 'skinn'], 'Negative'], [['mange', 'skjerme', 'diskre', 'design', 'smal', 'ramme', 'slank', 'fot'], 'Neutral'], [['190G6', 'Philips', 'historie'], 'Neutral'], [['utseende', 'kreve', 'oppmerksomhet', 'glinse', 'svart', 'ramme', 'glansbelegg', 'skjermflat', 'sølvfarge', 'sidepanel', 'innfelle', 'høyttaler', 'svart', 'deksel'], 'Neutral'], [['LES'], 'Neutral'], [['fot', 'blank', 'Søyle', 'knapp', 'blå', 'lys'], 'Neutral'], [['bakside', 'sort', 'blank', 'skinne', 'deksel', 'skjule', 'kontakt', 'kable'], 'Positive'], [['høyttalerbrønn', 'stikke', 'tydelig', 'snakk', 'gjemme'], 'Neutral'], [['Likegyldig', 'uansett', 'vanskelig'], 'Neutral'], [['betjening'], 'Positive'], [['midt', 'finne', 'volumknapp', 'rotere', 'fri'], 'Neutral'], [['nivå', 'lese', 'skjerm', 'dukke', 'skala', 'snar', 'skru', 'knappe'], 'Neutral'],

In [None]:
## Task 3

In [37]:
from matplotlib import pyplot as plt
import numpy as np
import torch

import torchvision
from torchvision import datasets, transforms

from collections import Counter
from torch.utils.data import random_split

In [38]:
seed = 10
torch.manual_seed(seed)

category_index = 8
n_val = 5000

data_path = '/cifar-10-batches-py'

Function for loading the Cifar10 dataset.

The method will have to be run twice.
After running the method for the first time we get create a normalizer from the std and mean of the images. The method is then ran for a second time with the normalizer as the preprocessor.

Loading the CIFAR-10 dataset as tensors.

In [39]:
transformed_cifar10_train_val = datasets.CIFAR10(
    data_path,
    train=True,
    download=False,
    transform = transforms.ToTensor()
)

Stacking the set of images into a single tensor. We then create a normalizer for the dataset around the mean and standard deviation of the 3 dimensions (height, width channel (color)).

In [None]:
imgs = torch.stack([img for img, _ in transformed_cifar10_train_val])

normalizer = transforms.Compose([transforms.ToTensor(), transforms.Normalize(mean = imgs.mean(dim=(0, 2, 3)), std = imgs.std(dim=(0, 2, 3)))])

Loading the dataset as tensors for training+validation and testing. This time we apply the composition of transforms.

In [None]:
normalized_cifar10_train_val = datasets.CIFAR10(
    data_path,
    train=True,
    download=False,
    transform = normalizer
)


transformed_cifar10_test = datasets.CIFAR10(
    data_path,
    train=False,
    download=False,
    transform = transforms.Compose([transforms.ToTensor(),normalizer])
)

As this is a binary classification problem where we only want to identify whether an image is a ship or not, we can set the labels that are "ship" to true. We set all other labels to false.

In [None]:
train_labels = np.array(transformed_cifar10_train_val)
train_labels = np.array(train_labels == category_index).astype(int)

test_labels = np.array(transformed_cifar10_test)
test_labels = np.array(test_labels == category_index).astype(int)

Splitting the training and validation set randomly.

In [None]:
n_train = len(transformed_cifar10_train_val)-n_val

transformed_cifar10_train_split, transformed_cifar10_val_split = random_split(
    transformed_cifar10_train_val,
    [n_train, n_val],

    generator=torch.Generator().manual_seed(seed)
)

print("Size of the train dataset:        ", len(transformed_cifar10_train_split))
print("Size of the validation dataset:   ", len(transformed_cifar10_val_split))
print("Size of the test dataset:         ", len(transformed_cifar10_test))

Counter([label for _, label in transformed_cifar10_train_split])

Choosing a pre-trained CNN model: we chose ResNet18, which is not trained on Cifar-10.

In [None]:
from torchvision import models
import torch.nn as nn
import torch.optim as optim

Loading pre-trained ResNet18 model and modifying the last layer. We are doing binary classification, so we think we only need one node in the final layer.

In [None]:
model = models.resnet18(pretrained=True)

num_features = model.fc.in_features
model.fc = nn.Linear(num_features,1)

We use Binary Cross Entropy as it should be suitable for binary classification problems (add reasoning and explanation). We use nn.BCEWithLogitsLoss() as it combines the sigmoid activation function and the BCE into a single class.

The optimizer we use is Adam, and we will begin with a learning rate of 0.001, just because it is a commonly used learning rate.

In [None]:
loss_function = nn.BCEWithLogitsLoss()


optimizer = optim.adam(model.parameters(),  lr=0.001)