In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier


from sklearn.datasets import load_iris
from sklearn.datasets import fetch_openml
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay


In [None]:

# Load the Palmer Penguins dataset through seaborn's sample-data loader and
# drop every row that has a missing value. dropna() is used in its
# non-mutating form so re-running this cell always rebuilds `penguins` from
# the freshly loaded frame.
penguins = sns.load_dataset("penguins").dropna()

# Features: the four numeric body measurements. Target: the species label.
X = penguins[['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']]
y = penguins['species']


# Split the data into training and testing sets (40% held out, fixed seed
# so the split is the same on every run).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# Preview the cleaned frame. This must be the last expression in the cell,
# otherwise the notebook discards the result without displaying it.
penguins.head()




In [None]:

# Set up the classifiers to compare.
# `names` and `classifiers` are parallel lists: names[i] is the title used
# for classifiers[i] when the results are plotted.
names = [
    "Nearest Neighbors",
    "Linear SVM",
    "Decision Tree",
    "Random Forest",
    "Neural Net"
]
# Estimators with internal randomness (tree, forest, MLP) get a fixed
# random_state so a Restart & Run All reproduces the same fitted models.
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    DecisionTreeClassifier(max_depth=5, random_state=42),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1, random_state=42),
    MLPClassifier(alpha=1, max_iter=1000, random_state=42)
]

In [None]:
# Plot one confusion matrix per classifier, side by side in a single row.

# Fix the class order once so that the matrix rows/columns and the tick
# labels always refer to the same classes, even if a class happens to be
# absent from the test split or from a classifier's predictions.
class_labels = np.unique(y)

# Exactly one column per classifier (no empty trailing slot).
# squeeze=False keeps `axes` two-dimensional even for a single classifier.
figure, axes = plt.subplots(1, len(classifiers), figsize=(20, 20), squeeze=False)

# iterate over classifiers
for ax, name, clf in zip(axes[0], names, classifiers):
    # Standardise the features before fitting: the measurements are on very
    # different numeric scales (mm vs. g).
    model = make_pipeline(StandardScaler(), clf)
    model.fit(X_train, y_train)

    # Plot the confusion matrix for the held-out test set
    y_pred = model.predict(X_test)
    cm = confusion_matrix(y_test, y_pred, labels=class_labels)
    ConfusionMatrixDisplay(cm, display_labels=class_labels).plot(ax=ax, cmap=plt.cm.Blues, xticks_rotation='vertical')

    ax.set_title(name)
    ax.set_xlabel('Predicted label')
    ax.set_ylabel('True label')

plt.tight_layout()
plt.show()


In [None]:
# Write the `penguins` DataFrame to penguins.csv in the current working
# directory, leaving the row index out of the file.
penguins.to_csv(path_or_buf='penguins.csv', index=False)


In [None]:
# Print the column index of `penguins`, i.e. the list of variable names.
print(penguins.columns)

In [None]:
# Display summary statistics for the variables. With its default arguments
# describe() summarises only the numeric columns of a mixed-dtype frame.
penguins.describe()

In [None]:
# Pivot table: mean of each body measurement per (species, island) pair.
# The aggregation is named with the string "mean" rather than passed as
# np.mean; recent pandas versions emit a FutureWarning for the NumPy callable.
pd.pivot_table(
    penguins,
    index=['species', 'island'],
    values=['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g'],
    aggfunc="mean",
)





In [None]:
# Per-species count of non-null entries in every remaining column.
penguins.groupby(by='species').count()

In [None]:
# Seaborn violin plot: distribution of bill length for each species.
sns.violinplot(data=penguins, x="species", y="bill_length_mm")


In [None]:
# Interactive Plotly version of the violin plot: bill length per species,
# with an inner box plot, every observation drawn as a point, and all
# columns of `penguins` offered as hover data.
import plotly.express as px
px.violin(
    data_frame=penguins,
    x="species",
    y="bill_length_mm",
    box=True,
    points="all",
    hover_data=penguins.columns,
)


In [None]:
# run a regression of body mass on species sex and island
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.formula.api import ols







In [None]:
# Ordinary least squares: body mass explained by species, sex and island.
# `formula` is reused by the model cells that follow.
formula = 'body_mass_g ~ species + sex + island'
ols_spec = ols(formula=formula, data=penguins)
model1 = ols_spec.fit()
print(model1.summary())



In [None]:
# Regularised (elastic-net) fit of the same formula.
# fit_regularized() defaults to alpha=0.0, i.e. no penalty at all, so the
# penalty weight has to be passed explicitly for this fit to differ from the
# plain OLS fit. L1_wt=1.0 (the library default) makes it a pure lasso penalty.
# alpha=1.0 is only a starting value: tune it for the scale of body_mass_g.
# NOTE(review): per the statsmodels docs a scalar alpha applies the same
# penalty to every term, intercept included; confirm that is intended.
model1r = ols(formula, data=penguins).fit_regularized(alpha=1.0, L1_wt=1.0)

# Show the penalised coefficients so the cell has a visible result.
model1r.params


In [None]:
# GLM with a Gaussian family, fitted on the same formula as the OLS model.
# TODO: explain the smf Gaussian family. Reference:
# https://www.statsmodels.org/stable/generated/statsmodels.genmod.families.family.Gaussian.html
gaussian_family = sm.families.Gaussian()
model2 = smf.glm(formula=formula, data=penguins, family=gaussian_family).fit()
model2.summary()

In [None]:
# GLM with a Binomial family (logistic regression).
# The Binomial family models a response in [0, 1] (a binary outcome or a
# proportion). body_mass_g is measured in grams and lies far outside that
# range, so it cannot be used as the response with this family. A binary
# indicator is derived instead: 1 when the penguin is heavier than the
# median body mass, 0 otherwise. The predictors are the same as in `formula`.
# NOTE(review): the above-median threshold is one valid choice of binary
# response; replace it if a different outcome was intended.
# Reference:
# https://www.statsmodels.org/stable/generated/statsmodels.genmod.families.family.Binomial.html
penguins_binary = penguins.assign(
    is_heavy=(penguins['body_mass_g'] > penguins['body_mass_g'].median()).astype(int)
)
binomial_formula = 'is_heavy ~ species + sex + island'
model3 = smf.glm(formula=binomial_formula, data=penguins_binary, family=sm.families.Binomial()).fit()
model3.summary()

In [None]:
# GLM with a Poisson family, fitted on the same formula.
# NOTE(review): the Poisson family is intended for count responses, while
# body_mass_g is a mass in grams; confirm this family is appropriate.
# TODO: explain the smf Poisson family. Reference:
# https://www.statsmodels.org/stable/generated/statsmodels.genmod.families.family.Poisson.html
poisson_family = sm.families.Poisson()
model4 = smf.glm(formula=formula, data=penguins, family=poisson_family).fit()
model4.summary()

In [None]:

# Show the dtype of every column in `penguins`.
penguins.dtypes



In [None]:
#change to integer
