In [3]:
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import cross_val_score, KFold
import pandas as pd
import numpy as np

# Load the regression dataset; every column except 'target' is used as a predictor
df = pd.read_csv("regressao_Q1.csv")
y = df['target']
X = df.drop('target', axis=1)

# ElasticNet with a small L1 share (l1_ratio=0.01), evaluated on 5 unshuffled folds
model = ElasticNet(alpha=1.0, l1_ratio=0.01, random_state=0)
kf = KFold(n_splits=5, shuffle=False)

# cross_val_score yields the *negated* MSE of each held-out fold; flip the sign
# to recover ordinary (positive) validation MSE values, then average them.
scores = cross_val_score(model, X, y, cv=kf, scoring='neg_mean_squared_error')
mse_scores = -scores
mean_mse = np.mean(mse_scores)


def _sum_squared_errors(estimator, features, target):
    """Return the sum of squared residuals of a fitted estimator on one split."""
    return np.sum((estimator.predict(features) - target) ** 2)


# The mean MSE cannot be turned back into a sum of squared errors exactly,
# because fold sizes may differ when the row count is not divisible by 5.
# The folds are therefore replayed by hand to collect per-fold SSE values.
train_sse = []
val_sse = []

for train_index, val_index in kf.split(X):
    # Slice out the rows belonging to this fold
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_val, y_val = X.iloc[val_index], y.iloc[val_index]

    # Refit on the training part only
    model.fit(X_train, y_train)

    # SSE on the data the model saw, and on the held-out part
    train_sse.append(_sum_squared_errors(model, X_train, y_train))
    val_sse.append(_sum_squared_errors(model, X_val, y_val))

# Average SSE across folds, separately for training and validation splits
mean_train_sse = np.mean(train_sse)
mean_val_sse = np.mean(val_sse)

mean_mse, mean_train_sse, mean_val_sse


(0.2691857437994568, 644.6406810715571, 161.51144627967406)

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import roc_auc_score

# Load the classification dataset; every column except 'target' is used as a predictor
df_classification = pd.read_csv("classificacao_Q2.csv")
y_class = df_classification['target']
X_class = df_classification.drop('target', axis=1)

# L2-penalised logistic regression (C=0.1), scored on 10 stratified, unshuffled folds
model_class = LogisticRegression(C=0.1, penalty='l2', solver='liblinear', random_state=0)
cv_class = StratifiedKFold(n_splits=10, shuffle=False)

# Held-out (validation) AUC of every fold as reported by cross_val_score, and its mean
auc_scores = cross_val_score(model_class, X_class, y_class, cv=cv_class, scoring='roc_auc')
mean_auc = np.mean(auc_scores)


def _auc_on(estimator, features, labels):
    """Return the ROC AUC computed from the second predict_proba column on one split."""
    return roc_auc_score(labels, estimator.predict_proba(features)[:, 1])


# cross_val_score gives no training-set scores, so the folds are replayed by hand
# to collect the AUC on both the training part and the held-out part.
train_auc_scores = []
valid_auc_scores = []
for train_index, val_index in cv_class.split(X_class, y_class):
    # Slice out the rows belonging to this fold
    X_train, y_train = X_class.iloc[train_index], y_class.iloc[train_index]
    X_val, y_val = X_class.iloc[val_index], y_class.iloc[val_index]

    # Refit on the training part only
    model_class.fit(X_train, y_train)

    # AUC on the data the model saw, and on the held-out part
    train_auc_scores.append(_auc_on(model_class, X_train, y_train))
    valid_auc_scores.append(_auc_on(model_class, X_val, y_val))


# Fold-averaged AUC for each kind of split
mean_train_auc = np.mean(train_auc_scores)
mean_valid_auc_scores = np.mean(valid_auc_scores)

mean_auc, mean_train_auc, mean_valid_auc_scores


(0.876851378187404, 0.8790619764973195, 0.876851378187404)

In [None]:
# Average the per-fold training AUC values; relies on `train_auc_scores` and
# `mean_auc` already existing in the kernel from an earlier cell.
mean_train_auc = np.asarray(train_auc_scores).mean()

(mean_auc, mean_train_auc)

In [14]:
import numpy as np
# Draw from a locally seeded generator so the sample, and the statistic shown
# below, are identical on every Restart & Run All. The unseeded global RNG used
# before produced a different number on each execution.
rng = np.random.default_rng(0)

# 100 draws from a normal distribution with mean 2.4 and standard deviation 0.04
X = rng.normal(2.4, 0.04, 100)

# 50th percentile of the sample, i.e. the sample median.
# NOTE(review): the name `d_9` reads like "9th decile" (which would be the 90th
# percentile) -- confirm that the median is really what is wanted here.
d_9 = np.percentile(X, 50)
d_9

2.396551281458314

In [15]:
import pandas as pd
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score

# Load the clustering dataset
df = pd.read_csv('agrupamento.csv')

# No feature subset is specified, so every column of the file is used for clustering
X = df

# Silhouette score obtained for each candidate number of clusters
silhouette_scores = {}

# Try 2, 3, 4 and 5 clusters
for n_clusters in range(2, 6):
    # Average-linkage agglomerative clustering on Euclidean distances.
    # The distance is deliberately left at its default (Euclidean): the keyword that
    # selects it was renamed from `affinity` to `metric` in scikit-learn 1.2 and
    # `affinity` was removed in 1.4, so naming either one breaks on some versions.
    clustering = AgglomerativeClustering(n_clusters=n_clusters, linkage='average')
    cluster_labels = clustering.fit_predict(X)

    # Mean silhouette coefficient over all samples for this partition
    silhouette_avg = silhouette_score(X, cluster_labels)
    silhouette_scores[n_clusters] = silhouette_avg

# Pick the cluster count whose silhouette score is highest
best_n_clusters = max(silhouette_scores, key=silhouette_scores.get)
best_silhouette_score = silhouette_scores[best_n_clusters]

best_n_clusters, best_silhouette_score, silhouette_scores




(4,
 0.589374033016901,
 {2: 0.5199002748643055,
  3: 0.5663652051962086,
  4: 0.589374033016901,
  5: 0.5725689189611636})