In [37]:
from google.colab import files
import pandas as pd


# Load into a DataFrame
df = pd.read_csv("nba.csv")
df.head()  # View the first few rows


Unnamed: 0,Name,GP,MIN,PPT,FGM,FGA,FG%,3PM,3PA,3P%,...,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,TAR
0,Brandon Ingram,36,27.4,7.4,2.6,7.6,34.7,0.5,2.1,25.0,...,2.3,69.9,0.7,3.4,4.1,1.9,0.4,0.4,1.3,0
1,Andrew Harrison,35,26.9,7.2,2.0,6.7,29.6,0.7,2.8,23.5,...,3.4,76.5,0.5,2.0,2.4,3.7,1.1,0.5,1.6,0
2,JaKarr Sampson,74,15.3,5.2,2.0,4.7,42.2,0.4,1.7,24.4,...,1.3,67.0,0.5,1.7,2.2,1.0,0.5,0.3,1.0,0
3,Malik Sealy,58,11.6,5.7,2.3,5.5,42.6,0.1,0.5,22.6,...,1.3,68.9,1.0,0.9,1.9,0.8,0.6,0.1,1.0,1
4,Matt Geiger,48,11.5,4.5,1.6,3.0,52.4,0.0,0.1,0.0,...,1.9,67.4,1.0,1.5,2.5,0.3,0.3,0.4,0.8,1


In [38]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1340 entries, 0 to 1339
Data columns (total 21 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Name    1340 non-null   object 
 1   GP      1340 non-null   int64  
 2   MIN     1340 non-null   float64
 3   PPT     1340 non-null   float64
 4   FGM     1340 non-null   float64
 5   FGA     1340 non-null   float64
 6   FG%     1340 non-null   float64
 7   3PM     1340 non-null   float64
 8   3PA     1340 non-null   float64
 9   3P%     1329 non-null   float64
 10  FTM     1340 non-null   float64
 11  FTA     1340 non-null   float64
 12  FT%     1340 non-null   float64
 13  OREB    1340 non-null   float64
 14  DREB    1340 non-null   float64
 15  REB     1340 non-null   float64
 16  AST     1340 non-null   float64
 17  STL     1340 non-null   float64
 18  BLK     1340 non-null   float64
 19  TOV     1340 non-null   float64
 20  TAR     1340 non-null   int64  
dtypes: float64(18), int64(2), object(1)
m

In [39]:
print(df.isnull().sum())
df.dropna(inplace=True)

Name     0
GP       0
MIN      0
PPT      0
FGM      0
FGA      0
FG%      0
3PM      0
3PA      0
3P%     11
FTM      0
FTA      0
FT%      0
OREB     0
DREB     0
REB      0
AST      0
STL      0
BLK      0
TOV      0
TAR      0
dtype: int64


In [40]:
print(df.dtypes)


Name     object
GP        int64
MIN     float64
PPT     float64
FGM     float64
FGA     float64
FG%     float64
3PM     float64
3PA     float64
3P%     float64
FTM     float64
FTA     float64
FT%     float64
OREB    float64
DREB    float64
REB     float64
AST     float64
STL     float64
BLK     float64
TOV     float64
TAR       int64
dtype: object


In [41]:
missing_percent = df.isnull().mean() * 100
print(missing_percent)


Name    0.0
GP      0.0
MIN     0.0
PPT     0.0
FGM     0.0
FGA     0.0
FG%     0.0
3PM     0.0
3PA     0.0
3P%     0.0
FTM     0.0
FTA     0.0
FT%     0.0
OREB    0.0
DREB    0.0
REB     0.0
AST     0.0
STL     0.0
BLK     0.0
TOV     0.0
TAR     0.0
dtype: float64


In [42]:
print(df['3P%'].skew())

0.2898780177258843


In [43]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Drop 'Name' column
df.drop(columns=['Name'], inplace=True)

# Separate features and target
X = df.drop(columns=['TAR'])
y = df['TAR']

# Normalize features (optional for tree-based models)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [44]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


In [45]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

knn_params = {'n_neighbors': [3, 5, 7, 9]}
knn = GridSearchCV(KNeighborsClassifier(), knn_params, cv=10, scoring='f1')
knn.fit(X_train, y_train)

print(f"Best KNN Params: {knn.best_params_}")


Best KNN Params: {'n_neighbors': 9}


In [46]:
from sklearn.ensemble import RandomForestClassifier

rf_params = {'n_estimators': [50, 100, 200], 'max_depth': [5, 10, None]}
rf = GridSearchCV(RandomForestClassifier(), rf_params, cv=10, scoring='f1')
rf.fit(X_train, y_train)

print(f"Best RF Params: {rf.best_params_}")


Best RF Params: {'max_depth': 5, 'n_estimators': 50}


In [47]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Define a valid parameter grid with correct solver-penalty pairs
param_grid = [
    {'penalty': ['l1', 'l2'], 'solver': ['liblinear'], 'max_iter': [2000]},
    {'penalty': ['l1', 'l2', 'elasticnet'], 'solver': ['saga'], 'l1_ratio': [0.1, 0.5, 0.9], 'max_iter': [2000]}
]

# Initialize GridSearchCV
lr = GridSearchCV(
    LogisticRegression(),
    param_grid,
    cv=10,
    scoring='f1',
    n_jobs=-1,  # Use all CPU cores
    verbose=1
)

# Train the model
lr.fit(X_train, y_train)

# Output best parameters
print(f"Best LR Params: {lr.best_params_}")


Fitting 10 folds for each of 11 candidates, totalling 110 fits
Best LR Params: {'l1_ratio': 0.5, 'max_iter': 2000, 'penalty': 'elasticnet', 'solver': 'saga'}


In [48]:
from sklearn.metrics import f1_score

models = {'KNN': knn.best_estimator_, 'Random Forest': rf.best_estimator_,
          'Logistic Regression': lr.best_estimator_, }

for name, model in models.items():
    y_pred = model.predict(X_test)
    f1 = f1_score(y_test, y_pred)
    print(f"{name} F1 Score: {f1:.4f}")


KNN F1 Score: 0.8023
Random Forest F1 Score: 0.8023
Logistic Regression F1 Score: 0.8011


In [49]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report

# Load dataset (replace with your dataset)
data = load_iris()
X, y = data.data, data.target

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the parameter grid for GridSearchCV
param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50)],  # Different layer configurations
    'activation': ['tanh', 'relu'],                  # Activation functions
    'solver': ['sgd', 'adam'],                       # Optimization algorithms
    'alpha': [0.0001, 0.05],                         # Regularization parameter
    'learning_rate': ['constant', 'adaptive'],       # Learning rate strategies
}

# Initialize the MLPClassifier
mlp = MLPClassifier(max_iter=2000, random_state=42)

# Initialize GridSearchCV with 3-fold cross-validation
an = GridSearchCV(estimator=mlp, param_grid=param_grid, scoring='f1_macro', cv=3, n_jobs=-1, verbose=2)

# Fit GridSearchCV
an.fit(X_train_scaled, y_train)

# Print the best parameters and best score
print(f"Best parameters found: {an.best_params_}")
print(f"Best cross-validation F1 score: {an.best_score_:.4f}")



Fitting 3 folds for each of 48 candidates, totalling 144 fits
Best parameters found: {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (50,), 'learning_rate': 'constant', 'solver': 'adam'}
Best cross-validation F1 score: 0.9581
