In [36]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.metrics import f1_score,accuracy_score,recall_score,precision_score

In [37]:
X, y = make_classification(
    n_samples=3000,
    n_features=50,         # High dimensional space
    n_informative=10,
    n_redundant=10,
    n_repeated=5,
    n_clusters_per_class=6, # Multiple clusters per class
    class_sep=0.8,          # Less separation → harder classification
    flip_y=0.1,             # 10% label noise
    weights=[0.6, 0.4],     # Class imbalance
    random_state=42
)

In [38]:
# cross validation
from sklearn.model_selection import cross_val_score,KFold
model = LogisticRegression(max_iter=1000)
kf = KFold(n_splits=10,shuffle=True,random_state=42)

In [39]:
for train_index,test_index in kf.split(X,y):
    print("training index:",train_index)
    print("testing index:",test_index)

training index: [   1    2    3 ... 2997 2998 2999]
testing index: [   0   14   30   32   44   45   51   52   63   67   80   88   93  102
  124  139  141  144  152  162  174  175  177  178  184  192  194  196
  203  211  246  251  254  256  266  270  289  291  296  298  314  321
  322  331  354  368  393  402  408  414  416  423  436  443  450  457
  463  471  472  478  479  486  495  506  511  521  532  533  535  555
  567  568  598  605  612  644  685  695  718  727  741  746  755  761
  765  772  781  783  787  794  798  817  842  844  845  857  862  879
  881  900  912  927  929  942  965  978  998 1001 1005 1025 1027 1034
 1044 1057 1064 1073 1078 1084 1090 1094 1103 1106 1117 1123 1174 1190
 1195 1200 1211 1222 1226 1231 1241 1261 1263 1268 1269 1270 1272 1298
 1299 1316 1321 1330 1359 1360 1362 1364 1374 1393 1411 1412 1429 1442
 1491 1502 1547 1551 1556 1567 1582 1583 1586 1600 1607 1614 1641 1665
 1670 1691 1694 1706 1710 1725 1732 1746 1752 1760 1762 1763 1770 1793
 1798 1801

In [40]:
cv_scores = cross_val_score(model,X,y,cv = kf,scoring="accuracy")
cv_scores = np.round(cv_scores,4)
print("cross validation scores")
for cv_score in cv_scores:
    print(cv_score)

print("average: " ,np.mean(cv_scores))    
print("std",np.std(cv_scores))

cross validation scores
0.6467
0.71
0.6233
0.62
0.65
0.68
0.65
0.6667
0.6433
0.6433
average:  0.6533300000000001
std 0.025261751720733853


In [41]:
from sklearn.model_selection import StratifiedKFold
skfold_validator = StratifiedKFold(n_splits=10)
skcv_results = cross_val_score(model,X,y,cv = skfold_validator)



In [43]:
print("stratified cross validation scores")
for item in np.round(skcv_results,4):
    print(item)

print("average: " ,np.mean(skcv_results))    
print("std",np.std(skcv_results))

stratified cross validation scores
0.6567
0.64
0.66
0.6633
0.6167
0.71
0.6067
0.6667
0.6867
0.6633
average:  0.657
std 0.028846336181759896


### hyper parameter tuning

#### gridsearch

In [54]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

parameters = {
    'max_depth' : [10,50,60],
    'criterion' : ['gini','entropy'],
    'n_estimators' : [50,100,200]
}

grid_search = GridSearchCV(
    RandomForestClassifier(),
    parameters,
    cv = 5,
    scoring='accuracy',
    verbose=1,
    n_jobs=1
)

In [55]:
from sklearn.datasets import make_classification

X, y = make_classification(
    n_samples=2000,       # fewer samples than before
    n_features=20,        # lower dimensionality
    n_informative=5,      # fewer informative features
    n_redundant=2,
    n_clusters_per_class=2, 
    class_sep=2.0,        # higher separation -> easier classification
    flip_y=0.01,          # very low label noise
    weights=[0.5, 0.5],   # balanced classes
    random_state=42
)

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=42)


In [56]:
grid_search.fit(X_train,y_train)


Fitting 5 folds for each of 18 candidates, totalling 90 fits


In [None]:
print("best paramter",grid_search.best_params_)
print("best score",grid_search.best_score_)

best paramter {'criterion': 'entropy', 'max_depth': 50, 'n_estimators': 100}
besr score 0.9826666666666666


In [53]:
from sklearn.model_selection import RandomizedSearchCV

random_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(),
    param_distributions=parameters,
    verbose=1,
    n_jobs=1,
    n_iter=18
)

random_search.fit(X_train,y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


In [57]:
print("best score:",random_search.best_score_)
print("best params:",random_search.best_params_)

best score: 0.9826666666666666
best params: {'n_estimators': 50, 'max_depth': 10, 'criterion': 'gini'}


In [None]:
# import pandas as pd
# data = pd.read_csv('beer-servings.csv')
# data = data.iloc[:,1:]
# data.head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0.0,0.0,0.0,0.0,Asia
1,Albania,89.0,132.0,54.0,4.9,Europe
2,Algeria,25.0,0.0,14.0,0.7,Africa
3,Andorra,245.0,138.0,312.0,12.4,Europe
4,Angola,217.0,57.0,45.0,5.9,Africa


In [None]:
# def replace_outliers(df,column_name):
#     q1 = df[column_name].quantile(0.25)
#     q3 = df[column_name].quantile(0.75)
#     iqr = q3-q1
#     lower_bond = q1 - 1.5*iqr
#     upper_bond = q3 + 1.5*iqr
#     df[column_name] = df[column_name].clip(lower = lower_bond,upper = upper_bond)
#     return df[column_name]



# numeric_columns = data.select_dtypes(include='number').columns.to_list()
# numeric_columns.remove('total_litres_of_pure_alcohol')
# for col in numeric_columns:
#     data[col] = replace_outliers(data,col)


In [None]:
# data = data.dropna(subset = ['total_litres_of_pure_alcohol'])

# #split features and target
# y = data['total_litres_of_pure_alcohol']
# X = data.drop('total_litres_of_pure_alcohol',axis = 1)

# X.head()
# y.head()

0     0.0
1     4.9
2     0.7
3    12.4
4     5.9
Name: total_litres_of_pure_alcohol, dtype: float64

In [None]:
# from sklearn.pipeline import Pipeline
# from sklearn.compose import ColumnTransformer
# from sklearn.impute import SimpleImputer

In [None]:
# numeric_features = X.select_dtypes(include = 'number').columns.to_list()
# categorical_features = X.select_dtypes(include = 'object').columns.to_list()

In [None]:
# from sklearn.preprocessing import PolynomialFeatures
# from sklearn.preprocessing import StandardScaler,OneHotEncoder
# # pipelines

# numeric_transormer = Pipeline(steps=[
#     ('imputer',SimpleImputer(strategy='median')),
#     ('poly',PolynomialFeatures(degree=2,include_bias=False)),   #to make this to polymomial regression
#     ('scaler',StandardScaler())
# ])

# categorical_transformer = Pipeline(steps = [
#     ('imputer',SimpleImputer(strategy="most_frequent")),
#     ('encoder',OneHotEncoder(handle_unknown='ignore'))
# ])
# # columns transformer

# preprocessor = ColumnTransformer(transformers=[
#     ('num',numeric_transormer,numeric_features),
#     ('cat',categorical_transformer,categorical_features)
# ])


In [None]:
# from sklearn.linear_model import LinearRegression
# model_pipeline = Pipeline(steps = [
#     ('preprocessor',preprocessor),
#     ('regressor',LinearRegression())
# ])


In [None]:

# model = LinearRegression()
# kf = KFold(n_splits=10,shuffle=True)
# cv_scores = cross_val_score(model,X,y,cv = kf,scoring="accuracy")
# cv_scores = np.round(cv_scores,4)
# print("cross validation scores")
# for cv_score in cv_scores:
#     print(cv_score)

# print("average: " ,np.mean(cv_scores))    
# print("std",np.std(cv_scores))

ValueError: 
All the 10 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
9 fits failed with the following error:
Traceback (most recent call last):
  File "c:\AI_ML\.venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\AI_ML\.venv\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\AI_ML\.venv\Lib\site-packages\sklearn\linear_model\_base.py", line 601, in fit
    X, y = validate_data(
           ~~~~~~~~~~~~~^
        self,
        ^^^^^
    ...<5 lines>...
        force_writeable=True,
        ^^^^^^^^^^^^^^^^^^^^^
    )
    ^
  File "c:\AI_ML\.venv\Lib\site-packages\sklearn\utils\validation.py", line 2961, in validate_data
    X, y = check_X_y(X, y, **check_params)
           ~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^
  File "c:\AI_ML\.venv\Lib\site-packages\sklearn\utils\validation.py", line 1370, in check_X_y
    X = check_array(
        X,
    ...<12 lines>...
        input_name="X",
    )
  File "c:\AI_ML\.venv\Lib\site-packages\sklearn\utils\validation.py", line 1055, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
  File "c:\AI_ML\.venv\Lib\site-packages\sklearn\utils\_array_api.py", line 839, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
  File "c:\AI_ML\.venv\Lib\site-packages\pandas\core\generic.py", line 2168, in __array__
    arr = np.asarray(values, dtype=dtype)
ValueError: could not convert string to float: 'Afghanistan'

--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "c:\AI_ML\.venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\AI_ML\.venv\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\AI_ML\.venv\Lib\site-packages\sklearn\linear_model\_base.py", line 601, in fit
    X, y = validate_data(
           ~~~~~~~~~~~~~^
        self,
        ^^^^^
    ...<5 lines>...
        force_writeable=True,
        ^^^^^^^^^^^^^^^^^^^^^
    )
    ^
  File "c:\AI_ML\.venv\Lib\site-packages\sklearn\utils\validation.py", line 2961, in validate_data
    X, y = check_X_y(X, y, **check_params)
           ~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^
  File "c:\AI_ML\.venv\Lib\site-packages\sklearn\utils\validation.py", line 1370, in check_X_y
    X = check_array(
        X,
    ...<12 lines>...
        input_name="X",
    )
  File "c:\AI_ML\.venv\Lib\site-packages\sklearn\utils\validation.py", line 1055, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
  File "c:\AI_ML\.venv\Lib\site-packages\sklearn\utils\_array_api.py", line 839, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
  File "c:\AI_ML\.venv\Lib\site-packages\pandas\core\generic.py", line 2168, in __array__
    arr = np.asarray(values, dtype=dtype)
ValueError: could not convert string to float: 'Albania'
