# Import the required libraries

In [None]:
import seaborn as sns
sns.set_palette('husl')
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
import hopsworks
import pandas as pd

In [None]:
# Log in to Hopsworks and get a handle on the project's feature store.
# `fs` is used by the upload cell at the end of the notebook.
project = hopsworks.login()
fs = project.get_feature_store()

In [None]:
# Location of the raw wine table (CSV hosted on GitHub).
WINE_CSV_URL = 'https://raw.githubusercontent.com/ID2223KTH/id2223kth.github.io/master/assignments/lab1/wine.csv'

# Load the raw data and show it as the cell output.
wine_df = pd.read_csv(WINE_CSV_URL)
wine_df


# Normalize column names (lowercase, underscores instead of spaces)

In [None]:
# Normalise the headers in one pass: spaces become underscores, then everything is lower-cased.
wine_df.columns = wine_df.columns.str.replace(' ', '_').str.lower()
wine_df


## Separate the dataframe into features and labels so that the features can be standardized

In [None]:
# Features are every column except 'quality'; 'quality' itself is the label.
X_wine = wine_df.drop(columns=['quality'])
y_wine = wine_df['quality']

## Plot to understand the distribution of the labels

In [None]:
# Name of the target column whose class balance is visualised below.
label_column = 'quality'
sns.set(style="whitegrid")  # seaborn theme with a light grid

# Number of rows for every distinct label value.
label_counts = wine_df[label_column].value_counts()

# One bar per label value, drawn on an explicitly created 10x6 inch axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=label_counts.index, y=label_counts.values, palette="viridis", ax=ax)

ax.set_xlabel('Label')
ax.set_ylabel('Count')
ax.set_title('Distribution of ' + label_column)

plt.show()

# The same counts in tabular form as the cell output.
label_counts

# Clean up all missing (NaN) values and encode all categorical data

In [None]:
from sklearn.preprocessing import LabelEncoder

# Only the 'type' column is categorical: replace its values with integer codes.
label_encoder = LabelEncoder()
X_wine['type'] = label_encoder.fit_transform(X_wine['type'])

# Re-attach the label to the encoded features and replace every missing value with 0.
# NOTE(review): 0 is not a plausible reading for measurements such as 'ph' or 'density' -
# confirm that zero-filling (rather than e.g. a median fill) is intended.
wine_df_cleaned = pd.concat([X_wine, y_wine], axis=1).fillna(0)
wine_df = wine_df_cleaned  # both names refer to the same cleaned frame

# Remaining NaN count per column after the fill.
print(wine_df.isna().sum())

wine_df.head()



In [None]:
import numpy as np
from sklearn.preprocessing import StandardScaler , MinMaxScaler


def _safe_ratio(numerator, denominator):
    """Element-wise ``numerator / denominator``; 0 wherever the result is undefined.

    A zero denominator would otherwise yield inf/NaN, which StandardScaler rejects.
    """
    return (numerator / denominator.replace(0, np.nan)).fillna(0)


def _row_statistics(base):
    """Return per-row summary statistics computed over the columns of ``base`` only.

    Every statistic is derived from the same fixed set of input columns, so a
    statistic never feeds into the next one (e.g. the row mean does not include
    the row skewness).
    """
    row_mean = base.mean(axis=1)
    row_std = base.std(axis=1)
    return pd.DataFrame(
        {
            'skewness': base.skew(axis=1),
            'kurtosis': base.kurtosis(axis=1),
            'mean': row_mean,
            'median': base.median(axis=1),
            'variance': base.var(axis=1),
            'std': row_std,
            'cv': _safe_ratio(row_std, row_mean),  # coefficient of variation
            'range': base.max(axis=1) - base.min(axis=1),
            'iqr': base.quantile(q=0.75, axis=1) - base.quantile(q=0.25, axis=1),
        },
        index=base.index,
    )


# Label and raw (cleaned, encoded) features.
y_wine = wine_df['quality']
x_wine = wine_df.drop(['quality'], axis=1)

# Work on a copy so that x_wine keeps only the original input columns.
Wine_standard = x_wine.copy()

# Share of free sulfur dioxide in the total sulfur dioxide.
Wine_standard['ratio_sulfur_dioxide'] = _safe_ratio(
    x_wine['free_sulfur_dioxide'], x_wine['total_sulfur_dioxide']
)

# Per-row statistics over the original input columns (engineered columns excluded).
Wine_standard = Wine_standard.join(_row_statistics(x_wine))

# Standardize every column to zero mean and unit variance.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test split in the
# cells below, so test-set statistics influence the scaling.
scaler = StandardScaler()
Wine = scaler.fit(Wine_standard)  # fit() returns the scaler itself
new_Wine = Wine.transform(Wine_standard)
assert np.isfinite(new_Wine).all(), "scaled features contain NaN or inf"

# Keep the original row index so that the frame stays aligned with y_wine.
Wine_upload = pd.DataFrame(new_Wine, columns=Wine_standard.columns, index=Wine_standard.index)

Wine_upload.head()




## Train a test model
Train a quick RandomForestClassifier to see the baseline score

In [None]:
from sklearn.model_selection import train_test_split

# Features only: the upload cells further down add a 'quality' column to Wine_upload, so it is
# dropped here to keep the label out of the model inputs if this cell is re-run afterwards.
model_features = Wine_upload.drop(columns=['quality'], errors='ignore')

# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    model_features, y_wine, test_size=0.2, random_state=40
)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Baseline: a random forest with default hyper-parameters.
# The fixed seed makes the reported score reproducible across runs.
rnd = RandomForestClassifier(random_state=40)
fit_rnd = rnd.fit(X_train,y_train)  # fit() returns the estimator itself
rnd_score = rnd.score(X_test,y_test)  # mean accuracy on the held-out rows
print("Random Forest Classifier Score: ",rnd_score)

In [None]:
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt

# PCA input: the standardized feature matrix (rows = samples, columns = features).
# 'quality' is dropped in case an upload cell already appended the label to Wine_upload.
pca_features = Wine_upload.drop(columns=['quality'], errors='ignore')

# Keep at most 20 components, but never more than there are features.
n = min(20, pca_features.shape[1])
pca = PCA(n_components=n).fit(pca_features)
X_pca_skl = pca.transform(pca_features)

# Explained variance per component (bars) and its running total (red step line).
component_index = range(n)
plt.bar(component_index, pca.explained_variance_ratio_, label="individual var")
plt.step(component_index, np.cumsum(pca.explained_variance_ratio_), 'r', label="cumulative var")
plt.xlabel('Principal component index')
plt.ylabel('explained variance ratio %')
plt.legend()

# Shapes after and before the projection.
print(X_pca_skl.shape)
print(pca_features.shape)

In [None]:
# Seeded 80/20 split consumed by model_evaluation in the cells below.
# Inputs are the standardized features (without the label column) and the quality label.
# y_train / y_test are reassigned here; after this cell they pair with x_train / x_test.
x_train, x_test, y_train, y_test = train_test_split(
    Wine_upload.drop(columns=['quality'], errors='ignore'),
    y_wine,
    test_size=0.2,
    random_state=40,
)



In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import svm, datasets
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix




def model_evaluation(model, x_train, x_test, y_train, y_test):
    """Fit ``model``, report its accuracy on the test set and plot the confusion matrix.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``
        Fitted in place on the training data.
    x_train, x_test : array-like
        Training and test feature matrices.
    y_train, y_test : array-like
        Training and test labels. ``y_train`` may be a Series, a single-column
        frame, an array or a list; it is flattened to one dimension before fitting.

    Returns
    -------
    float
        Accuracy of the fitted model on ``x_test`` / ``y_test``.
    """
    import numpy as np  # local import: the surrounding cell does not import numpy

    model.fit(x_train, np.asarray(y_train).ravel())

    # Predict once and derive every metric from the same predictions.
    predictions = model.predict(x_test)
    score = accuracy_score(y_test, predictions)
    print("Accuracy score: %.2f" % score)

    cm = confusion_matrix(y_test, predictions)
    print(cm)

    # Heat map of the confusion matrix: rows are actual labels, columns predicted labels.
    plt.figure(figsize=(4, 4))
    sns.heatmap(cm, annot=True, fmt=".0f", linewidths=.5,
                square=True, cmap='Blues_r')
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    all_sample_title = 'Accuracy Score: {0}'.format(score)
    plt.title(all_sample_title, size=10)
    return score

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default hyper-parameters; the fixed seed makes the score reproducible.
model = RandomForestClassifier(random_state=40)
model_evaluation(model, x_train, x_test, y_train, y_test)

In [None]:
from sklearn.linear_model import SGDClassifier

# Evaluate every candidate on the same split instead of overwriting `model` until only
# the last one is left. SVC stays last, so `model` still refers to it after the cell.
candidate_models = {
    'LogisticRegression': LogisticRegression(max_iter=10000),  # raised iteration cap
    'SGDClassifier': SGDClassifier(random_state=40),  # seeded for a reproducible score
    'SVC': svm.SVC(),
}

model_scores = {}
for model_name, model in candidate_models.items():
    print(model_name)
    model_scores[model_name] = model_evaluation(model, x_train, x_test, y_train, y_test)

# Accuracy per model as the cell output.
model_scores

In [None]:
# Column summary of the cleaned table, followed by its first rows as the cell output.
wine_df.info()
wine_df.head()

In [None]:
# Attach the label so that features and target are stored together.
Wine_upload['quality'] = y_wine
Wine_upload.info()

# Split the frame into two halves so that they can be uploaded to the feature store separately.
# The boundary is derived from the frame length, so no trailing row is lost.
split_at = len(Wine_upload) // 2
Wine_upload1 = Wine_upload.iloc[:split_at]
Wine_upload2 = Wine_upload.iloc[split_at:]
assert len(Wine_upload1) + len(Wine_upload2) == len(Wine_upload)

# info() prints its report itself and returns None, so it is not wrapped in print().
Wine_upload1.info()
Wine_upload2.info()


In [None]:
# Attach the label (a no-op if an earlier cell already added it) and show the final schema.
Wine_upload['quality'] = y_wine
Wine_upload.info()

# Create the feature group (or fetch it if it already exists) and insert all rows.
# NOTE(review): the primary key consists of standardized float features; rows with identical
# values in these columns share a key - consider a dedicated id column.
wine_fg = fs.get_or_create_feature_group(
    name='winequality',
    version=10,  # feature-group versions are integers, not strings
    primary_key=[
        'type', 'fixed_acidity', 'volatile_acidity', 'citric_acid', 'residual_sugar',
        'chlorides', 'free_sulfur_dioxide', 'total_sulfur_dioxide', 'density', 'ph',
        'sulphates', 'alcohol',
    ],
    description='wine quality',
)
wine_fg.insert(Wine_upload)