In [1]:
#import all the library
import pandas as pd
import pickle
import numpy as np
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification
from matplotlib import pyplot as plt
from sklearn.svm import SVC

In [2]:
#reading the dataset
df = pd.read_csv("PCOS_data.csv")
df = df.fillna(0)
print(df)

     Sl. No  Patient File No.  PCOS (Y/N)   Age (yrs)  Weight (Kg)  \
0         1                 1           0          28         44.6   
1         2                 2           0          36         65.0   
2         3                 3           1          33         68.8   
3         4                 4           0          37         65.0   
4         5                 5           0          25         52.0   
..      ...               ...         ...         ...          ...   
536     537               537           0          35         50.0   
537     538               538           0          30         63.2   
538     539               539           0          36         54.0   
539     540               540           0          27         50.0   
540     541               541           1          23         82.0   

     Height(Cm)    BMI  Blood Group  Pulse rate(bpm)   RR (breaths/min)  ...  \
0        152.000  19.3           15                78                22  ...   

In [3]:
df.dtypes

Sl. No                      int64
Patient File No.            int64
PCOS (Y/N)                  int64
 Age (yrs)                  int64
Weight (Kg)               float64
Height(Cm)                float64
BMI                       float64
Blood Group                 int64
Pulse rate(bpm)             int64
RR (breaths/min)            int64
Hb(g/dl)                  float64
Cycle(R/I)                  int64
Cycle length(days)          int64
Marraige Status (Yrs)     float64
Pregnant(Y/N)               int64
No. of abortions            int64
  I   beta-HCG(mIU/mL)    float64
II    beta-HCG(mIU/mL)     object
FSH(mIU/mL)               float64
LH(mIU/mL)                float64
FSH/LH                    float64
Hip(inch)                   int64
Waist(inch)                 int64
Waist:Hip Ratio           float64
TSH (mIU/L)               float64
AMH(ng/mL)                 object
PRL(ng/mL)                float64
Vit D3 (ng/mL)            float64
PRG(ng/mL)                float64
RBS(mg/dl)    

In [4]:
df.columns

Index(['Sl. No', 'Patient File No.', 'PCOS (Y/N)', ' Age (yrs)', 'Weight (Kg)',
       'Height(Cm) ', 'BMI', 'Blood Group', 'Pulse rate(bpm) ',
       'RR (breaths/min)', 'Hb(g/dl)', 'Cycle(R/I)', 'Cycle length(days)',
       'Marraige Status (Yrs)', 'Pregnant(Y/N)', 'No. of abortions',
       '  I   beta-HCG(mIU/mL)', 'II    beta-HCG(mIU/mL)', 'FSH(mIU/mL)',
       'LH(mIU/mL)', 'FSH/LH', 'Hip(inch)', 'Waist(inch)', 'Waist:Hip Ratio',
       'TSH (mIU/L)', 'AMH(ng/mL)', 'PRL(ng/mL)', 'Vit D3 (ng/mL)',
       'PRG(ng/mL)', 'RBS(mg/dl)', 'Weight gain(Y/N)', 'hair growth(Y/N)',
       'Skin darkening (Y/N)', 'Hair loss(Y/N)', 'Pimples(Y/N)',
       'Fast food (Y/N)', 'Reg.Exercise(Y/N)', 'BP _Systolic (mmHg)',
       'BP _Diastolic (mmHg)', 'Follicle No. (L)', 'Follicle No. (R)',
       'Avg. F size (L) (mm)', 'Avg. F size (R) (mm)', 'Endometrium (mm)',
       'Unnamed: 44'],
      dtype='object')

In [5]:
#removing the unnamed column
import pandas as pd

# Step 1: Read the CSV file into a DataFrame
df = pd.read_csv('PCOS_data.csv')

# Step 2: Drop columns that start with 'Unnamed'
df = df.loc[:, ~df.columns.str.startswith('Unnamed')]

# Step 3: Save back to a new CSV file (optional)
df.to_csv('cleaned_file.csv', index=False)


In [6]:
#to convert objects or string into float
import pandas as pd
import re

# Step 1: Read the CSV file into a DataFrame
df = pd.read_csv('PCOS_data.csv')

# Step 2: Clean the column by removing any unwanted characters
# Replace 'column_name' with the name of your column
df['II    beta-HCG(mIU/mL)'] = df['II    beta-HCG(mIU/mL)'].apply(lambda x: re.sub(r'[^0-9.-]', '', str(x)))
df['AMH(ng/mL)'] = df['AMH(ng/mL)'].apply(lambda x: re.sub(r'[^0-9.-]', '', str(x)))

# Step 3: Convert the cleaned column to float
df['II    beta-HCG(mIU/mL)'] = pd.to_numeric(df['II    beta-HCG(mIU/mL)'], errors='coerce')
df['AMH(ng/mL)'] = pd.to_numeric(df['AMH(ng/mL)'], errors='coerce')

# Step 4: Check for conversion issues
print(df['II    beta-HCG(mIU/mL)'].isnull().sum(), "rows have been converted to NaN due to non-numeric values.")
print(df['AMH(ng/mL)'].isnull().sum(), "rows have been converted to NaN due to non-numeric values.")

# Step 5: Save back to a new CSV file
df.to_csv('converted_file.csv', index=False)


1 rows have been converted to NaN due to non-numeric values.
1 rows have been converted to NaN due to non-numeric values.


In [7]:
#fix the error
value = '1.99.'
try:
    # Remove any trailing periods
    value = value.strip().replace('.', '', value.count('.') - 1)
    float_value = float(value)
    print(float_value)
except ValueError as e:
    print(f"Error converting to float: {e}")

199.0


In [8]:
df.dtypes

Sl. No                      int64
Patient File No.            int64
PCOS (Y/N)                  int64
 Age (yrs)                  int64
Weight (Kg)               float64
Height(Cm)                float64
BMI                       float64
Blood Group                 int64
Pulse rate(bpm)             int64
RR (breaths/min)            int64
Hb(g/dl)                  float64
Cycle(R/I)                  int64
Cycle length(days)          int64
Marraige Status (Yrs)     float64
Pregnant(Y/N)               int64
No. of abortions            int64
  I   beta-HCG(mIU/mL)    float64
II    beta-HCG(mIU/mL)    float64
FSH(mIU/mL)               float64
LH(mIU/mL)                float64
FSH/LH                    float64
Hip(inch)                   int64
Waist(inch)                 int64
Waist:Hip Ratio           float64
TSH (mIU/L)               float64
AMH(ng/mL)                float64
PRL(ng/mL)                float64
Vit D3 (ng/mL)            float64
PRG(ng/mL)                float64
RBS(mg/dl)    

In [9]:
#removing the unnamed column
import pandas as pd

# Step 1: Read the CSV file into a DataFrame
df = pd.read_csv('PCOS_data.csv')

# Step 2: Drop columns that start with 'Unnamed'
df = df.loc[:, ~df.columns.str.startswith('Unnamed')]

# Step 3: Save back to a new CSV file (optional)
df.to_csv('cleaned_file.csv', index=False)


In [10]:
df

Unnamed: 0,Sl. No,Patient File No.,PCOS (Y/N),Age (yrs),Weight (Kg),Height(Cm),BMI,Blood Group,Pulse rate(bpm),RR (breaths/min),...,Pimples(Y/N),Fast food (Y/N),Reg.Exercise(Y/N),BP _Systolic (mmHg),BP _Diastolic (mmHg),Follicle No. (L),Follicle No. (R),Avg. F size (L) (mm),Avg. F size (R) (mm),Endometrium (mm)
0,1,1,0,28,44.6,152.000,19.3,15,78,22,...,0,1.0,0,110,80,3,3,18.0,18.0,8.5
1,2,2,0,36,65.0,161.500,24.9,15,74,20,...,0,0.0,0,120,70,3,5,15.0,14.0,3.7
2,3,3,1,33,68.8,165.000,25.3,11,72,18,...,1,1.0,0,120,80,13,15,18.0,20.0,10.0
3,4,4,0,37,65.0,148.000,29.7,13,72,20,...,0,0.0,0,120,70,2,2,15.0,14.0,7.5
4,5,5,0,25,52.0,161.000,20.1,11,72,18,...,0,0.0,0,120,80,3,4,16.0,14.0,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
536,537,537,0,35,50.0,164.592,18.5,17,72,16,...,0,0.0,0,110,70,1,0,17.5,10.0,6.7
537,538,538,0,30,63.2,158.000,25.3,15,72,18,...,0,0.0,0,110,70,9,7,19.0,18.0,8.2
538,539,539,0,36,54.0,152.000,23.4,13,74,20,...,0,0.0,0,110,80,1,0,18.0,9.0,7.3
539,540,540,0,27,50.0,150.000,22.2,15,74,20,...,1,0.0,0,110,70,7,6,18.0,16.0,11.5


In [11]:
df.dtypes

Sl. No                      int64
Patient File No.            int64
PCOS (Y/N)                  int64
 Age (yrs)                  int64
Weight (Kg)               float64
Height(Cm)                float64
BMI                       float64
Blood Group                 int64
Pulse rate(bpm)             int64
RR (breaths/min)            int64
Hb(g/dl)                  float64
Cycle(R/I)                  int64
Cycle length(days)          int64
Marraige Status (Yrs)     float64
Pregnant(Y/N)               int64
No. of abortions            int64
  I   beta-HCG(mIU/mL)    float64
II    beta-HCG(mIU/mL)     object
FSH(mIU/mL)               float64
LH(mIU/mL)                float64
FSH/LH                    float64
Hip(inch)                   int64
Waist(inch)                 int64
Waist:Hip Ratio           float64
TSH (mIU/L)               float64
AMH(ng/mL)                 object
PRL(ng/mL)                float64
Vit D3 (ng/mL)            float64
PRG(ng/mL)                float64
RBS(mg/dl)    

In [12]:
inputs = df.drop(["PCOS (Y/N)","Sl. No","Patient File No."],axis = 'columns')
#by dropping this columns we can achieve accuracy and they do not reflect in the target

In [13]:
inputs.head(50)     #focus on first 50 rows of Dataframes
inputx = inputs.head(4)    #focus on the 4 rows which forms the subset of Dataframes

In [14]:
inputx
inputy = inputx.tail(2)   #derive last two rows of the subset of Dataframes

In [15]:
print(inputy)    #last two rows result is stored in this variable

    Age (yrs)  Weight (Kg)  Height(Cm)    BMI  Blood Group  Pulse rate(bpm)   \
2          33         68.8        165.0  25.3           11                72   
3          37         65.0        148.0  29.7           13                72   

   RR (breaths/min)  Hb(g/dl)  Cycle(R/I)  Cycle length(days)  ...  \
2                18      11.8           2                   5  ...   
3                20      12.0           2                   5  ...   

   Pimples(Y/N)  Fast food (Y/N)  Reg.Exercise(Y/N)  BP _Systolic (mmHg)  \
2             1              1.0                  0                  120   
3             0              0.0                  0                  120   

  BP _Diastolic (mmHg)  Follicle No. (L)  Follicle No. (R)  \
2                   80                13                15   
3                   70                 2                 2   

   Avg. F size (L) (mm)  Avg. F size (R) (mm)  Endometrium (mm)  
2                  18.0                  20.0              10.0  


In [16]:
target = df["PCOS (Y/N)"]      #focus on the target
targetdum = target.head(4)     #extracts the rows from the target Dataframe
inputsdumy = inputs.head(4)     # first 4 rows from the inputs Dataframe into inputsdumy

In [17]:
inputs    #display the inputs DataFrame

Unnamed: 0,Age (yrs),Weight (Kg),Height(Cm),BMI,Blood Group,Pulse rate(bpm),RR (breaths/min),Hb(g/dl),Cycle(R/I),Cycle length(days),...,Pimples(Y/N),Fast food (Y/N),Reg.Exercise(Y/N),BP _Systolic (mmHg),BP _Diastolic (mmHg),Follicle No. (L),Follicle No. (R),Avg. F size (L) (mm),Avg. F size (R) (mm),Endometrium (mm)
0,28,44.6,152.000,19.3,15,78,22,10.48,2,5,...,0,1.0,0,110,80,3,3,18.0,18.0,8.5
1,36,65.0,161.500,24.9,15,74,20,11.70,2,5,...,0,0.0,0,120,70,3,5,15.0,14.0,3.7
2,33,68.8,165.000,25.3,11,72,18,11.80,2,5,...,1,1.0,0,120,80,13,15,18.0,20.0,10.0
3,37,65.0,148.000,29.7,13,72,20,12.00,2,5,...,0,0.0,0,120,70,2,2,15.0,14.0,7.5
4,25,52.0,161.000,20.1,11,72,18,10.00,2,5,...,0,0.0,0,120,80,3,4,16.0,14.0,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
536,35,50.0,164.592,18.5,17,72,16,11.00,2,5,...,0,0.0,0,110,70,1,0,17.5,10.0,6.7
537,30,63.2,158.000,25.3,15,72,18,10.80,2,5,...,0,0.0,0,110,70,9,7,19.0,18.0,8.2
538,36,54.0,152.000,23.4,13,74,20,10.80,2,6,...,0,0.0,0,110,80,1,0,18.0,9.0,7.3
539,27,50.0,150.000,22.2,15,74,20,12.00,4,2,...,1,0.0,0,110,70,7,6,18.0,16.0,11.5


In [18]:
inputs.dtypes    #display the data types of the inputs DataFrame

 Age (yrs)                  int64
Weight (Kg)               float64
Height(Cm)                float64
BMI                       float64
Blood Group                 int64
Pulse rate(bpm)             int64
RR (breaths/min)            int64
Hb(g/dl)                  float64
Cycle(R/I)                  int64
Cycle length(days)          int64
Marraige Status (Yrs)     float64
Pregnant(Y/N)               int64
No. of abortions            int64
  I   beta-HCG(mIU/mL)    float64
II    beta-HCG(mIU/mL)     object
FSH(mIU/mL)               float64
LH(mIU/mL)                float64
FSH/LH                    float64
Hip(inch)                   int64
Waist(inch)                 int64
Waist:Hip Ratio           float64
TSH (mIU/L)               float64
AMH(ng/mL)                 object
PRL(ng/mL)                float64
Vit D3 (ng/mL)            float64
PRG(ng/mL)                float64
RBS(mg/dl)                float64
Weight gain(Y/N)            int64
hair growth(Y/N)            int64
Skin darkening

In [19]:
#to convert objects or string into float
import pandas as pd
import re

# Step 1: Read the CSV file into a DataFrame
inputs = pd.read_csv('PCOS_data.csv')

# Step 2: Clean the column by removing any unwanted characters
# Replace 'column_name' with the name of your column
inputs['II    beta-HCG(mIU/mL)'] = inputs['II    beta-HCG(mIU/mL)'].apply(lambda x: re.sub(r'[^0-9.-]', '', str(x)))
inputs['AMH(ng/mL)'] = df['AMH(ng/mL)'].apply(lambda x: re.sub(r'[^0-9.-]', '', str(x)))

# Step 3: Convert the cleaned column to float
inputs['II    beta-HCG(mIU/mL)'] = pd.to_numeric(inputs['II    beta-HCG(mIU/mL)'], errors='coerce')
inputs['AMH(ng/mL)'] = pd.to_numeric(inputs['AMH(ng/mL)'], errors='coerce')

# Step 4: Check for conversion issues
print(inputs['II    beta-HCG(mIU/mL)'].isnull().sum(), "rows have been converted to NaN due to non-numeric values.")
print(inputs['AMH(ng/mL)'].isnull().sum(), "rows have been converted to NaN due to non-numeric values.")

# Step 5: Save back to a new CSV file
inputs.to_csv('converted_file.csv', index=False)


1 rows have been converted to NaN due to non-numeric values.
1 rows have been converted to NaN due to non-numeric values.


In [20]:
inputs.dtypes

Sl. No                      int64
Patient File No.            int64
PCOS (Y/N)                  int64
 Age (yrs)                  int64
Weight (Kg)               float64
Height(Cm)                float64
BMI                       float64
Blood Group                 int64
Pulse rate(bpm)             int64
RR (breaths/min)            int64
Hb(g/dl)                  float64
Cycle(R/I)                  int64
Cycle length(days)          int64
Marraige Status (Yrs)     float64
Pregnant(Y/N)               int64
No. of abortions            int64
  I   beta-HCG(mIU/mL)    float64
II    beta-HCG(mIU/mL)    float64
FSH(mIU/mL)               float64
LH(mIU/mL)                float64
FSH/LH                    float64
Hip(inch)                   int64
Waist(inch)                 int64
Waist:Hip Ratio           float64
TSH (mIU/L)               float64
AMH(ng/mL)                float64
PRL(ng/mL)                float64
Vit D3 (ng/mL)            float64
PRG(ng/mL)                float64
RBS(mg/dl)    

In [21]:
from sklearn.model_selection import train_test_split   #spilt the dataset into train & test part such that we train one set of data and test the other set of data
X_train, X_test, y_train, y_test = train_test_split(inputsdumy,targetdum,test_size=1)   #since testsize=1 it create 3rows of train input features and 1 row of test features
df.dtypes    #display the data-types

Sl. No                      int64
Patient File No.            int64
PCOS (Y/N)                  int64
 Age (yrs)                  int64
Weight (Kg)               float64
Height(Cm)                float64
BMI                       float64
Blood Group                 int64
Pulse rate(bpm)             int64
RR (breaths/min)            int64
Hb(g/dl)                  float64
Cycle(R/I)                  int64
Cycle length(days)          int64
Marraige Status (Yrs)     float64
Pregnant(Y/N)               int64
No. of abortions            int64
  I   beta-HCG(mIU/mL)    float64
II    beta-HCG(mIU/mL)     object
FSH(mIU/mL)               float64
LH(mIU/mL)                float64
FSH/LH                    float64
Hip(inch)                   int64
Waist(inch)                 int64
Waist:Hip Ratio           float64
TSH (mIU/L)               float64
AMH(ng/mL)                 object
PRL(ng/mL)                float64
Vit D3 (ng/mL)            float64
PRG(ng/mL)                float64
RBS(mg/dl)    

In [22]:
import pandas as pd

# Check for non-numeric values in the inputs
non_numeric = inputs[inputs.applymap(lambda x: isinstance(x, str) and not x.replace('.', '', 1).isdigit())]
print(non_numeric)


     Sl. No  Patient File No.  PCOS (Y/N)   Age (yrs)  Weight (Kg)  \
0       NaN               NaN         NaN         NaN          NaN   
1       NaN               NaN         NaN         NaN          NaN   
2       NaN               NaN         NaN         NaN          NaN   
3       NaN               NaN         NaN         NaN          NaN   
4       NaN               NaN         NaN         NaN          NaN   
..      ...               ...         ...         ...          ...   
536     NaN               NaN         NaN         NaN          NaN   
537     NaN               NaN         NaN         NaN          NaN   
538     NaN               NaN         NaN         NaN          NaN   
539     NaN               NaN         NaN         NaN          NaN   
540     NaN               NaN         NaN         NaN          NaN   

     Height(Cm)   BMI  Blood Group  Pulse rate(bpm)   RR (breaths/min)  ...  \
0            NaN  NaN          NaN               NaN               NaN  ...   
1

In [23]:
import numpy as np

# Replace periods with NaN
inputs = inputs.replace('.', np.nan)

# Optionally, fill NaN with a value like 0 or the mean of the column
inputs = inputs.fillna(0)  # or use 
#inputs.fillna(inputs.mean())


In [24]:
# Convert the DataFrame to numeric types (non-convertible values become NaN)
inputs = inputs.apply(pd.to_numeric, errors='coerce')


In [25]:
reg_model = LogisticRegression()   #binary classification of 0 and 1 i.e Y/N
reg_model.fit(inputs, target)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [26]:
# Drop unnecessary columns from both X_train and X_test
X_train = X_train.drop(['PCOS (Y/N)', 'Patient File No.', 'Sl. No', 'Unnamed: 44'], axis=1, errors='ignore')
X_test = X_test.drop(['PCOS (Y/N)', 'Patient File No.', 'Sl. No', 'Unnamed: 44'], axis=1, errors='ignore')


In [27]:
print(X_train.columns)
print(X_test.columns)


Index([' Age (yrs)', 'Weight (Kg)', 'Height(Cm) ', 'BMI', 'Blood Group',
       'Pulse rate(bpm) ', 'RR (breaths/min)', 'Hb(g/dl)', 'Cycle(R/I)',
       'Cycle length(days)', 'Marraige Status (Yrs)', 'Pregnant(Y/N)',
       'No. of abortions', '  I   beta-HCG(mIU/mL)', 'II    beta-HCG(mIU/mL)',
       'FSH(mIU/mL)', 'LH(mIU/mL)', 'FSH/LH', 'Hip(inch)', 'Waist(inch)',
       'Waist:Hip Ratio', 'TSH (mIU/L)', 'AMH(ng/mL)', 'PRL(ng/mL)',
       'Vit D3 (ng/mL)', 'PRG(ng/mL)', 'RBS(mg/dl)', 'Weight gain(Y/N)',
       'hair growth(Y/N)', 'Skin darkening (Y/N)', 'Hair loss(Y/N)',
       'Pimples(Y/N)', 'Fast food (Y/N)', 'Reg.Exercise(Y/N)',
       'BP _Systolic (mmHg)', 'BP _Diastolic (mmHg)', 'Follicle No. (L)',
       'Follicle No. (R)', 'Avg. F size (L) (mm)', 'Avg. F size (R) (mm)',
       'Endometrium (mm)'],
      dtype='object')
Index([' Age (yrs)', 'Weight (Kg)', 'Height(Cm) ', 'BMI', 'Blood Group',
       'Pulse rate(bpm) ', 'RR (breaths/min)', 'Hb(g/dl)', 'Cycle(R/I)',
       'Cyc

In [28]:
#reg_model = LogisticRegression()
#reg_model.fit(X_train, y_train)
#score = reg_model.score(X_test, y_test)
#print("Prediction score is", score * 100)

# Import necessary libraries
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Example dataset
# Ensure `inputs` and `target` contain examples of both classes (e.g., 0s and 1s)
X_train, X_test, y_train, y_test = train_test_split(inputs, target, test_size=0.1, random_state=42)

# Initialize the Logistic Regression model
logistic_model = LogisticRegression(random_state=42)

# Fit the model to the training data
logistic_model.fit(X_train, y_train)

# Predict the target values for the test set
y_pred = logistic_model.predict(X_test)

# Calculate the accuracy score
accuracy = accuracy_score(y_test, y_pred) * 100

# Print the accuracy score
print(f"Prediction accuracy using Logistic Regression is {accuracy:.2f}%")



Prediction accuracy using Logistic Regression is 76.36%


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [29]:
#Support Vector Machine (SVM) is a supervised machine learning algorithm primarily used for classification tasks[finds hyperplane]

# Import necessary libraries
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Define the SVM model (SV_model)
SV_model = SVC(kernel='linear', C=1.0, random_state=42)

# Drop unnecessary columns from both X_train and inputs
columns_to_drop = ['PCOS (Y/N)', 'Patient File No.', 'Sl. No', 'Unnamed: 44']
X_train = X_train.drop(columns=columns_to_drop, axis=1, errors='ignore')
X_test = X_test.drop(columns=columns_to_drop, axis=1, errors='ignore')
inputs = inputs.drop(columns=columns_to_drop, axis=1, errors='ignore')

# Align inputs with the columns of X_train
inputs = inputs[X_train.columns]

# Fit the model to the training data
SV_model.fit(X_train, y_train)

# Calculate the accuracy score for the test set (optional)
score = SV_model.score(X_test, y_test) * 100
print(f"Prediction score on the test set is {score:.2f}%")


Prediction score on the test set is 90.91%


In [32]:
# use a Random Forest Classifier to train a model and evaluate its accuracy.[decision tree]
from sklearn.ensemble import RandomForestClassifier
clf=RandomForestClassifier(n_estimators=200)    #200 decision tree
clf.fit(X_train,y_train)
Score = clf.score(inputs,target)*100
print("prediction score is",Score)

prediction score is 97.59704251386322


In [31]:
# Import necessary libraries
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(inputs, target, test_size=0.2, random_state=42)

# Initialize the Gradient Boosting Classifier
gbm_model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

# Fit the model to the training data
gbm_model.fit(X_train, y_train)

# Predict the target values for the test set
y_pred = gbm_model.predict(X_test)

# Calculate the accuracy score
accuracy = accuracy_score(y_test, y_pred) * 100

# Print the accuracy score
print(f"Prediction accuracy using GBM is {accuracy:.2f}%")     #Gradient Boosting Machine

Prediction accuracy using GBM is 86.24%
