In [None]:
import pandas as pd # Importing pandas for manupilation
import numpy as np # Importing numpy for numerical computations
from sklearn.preprocessing import StandardScaler,MinMaxScaler,LabelEncoder,OneHotEncoder
from sklearn.model_selection import train_test_split

In [None]:
# Step 1: Load the dataset (the CSV file is provided separately from this notebook)
csv_path = '/content/candy-data.csv'
df = pd.read_csv(csv_path)  # parse the CSV into a pandas DataFrame
print("Original Data:")
preview = df.head()  # first rows only, enough to eyeball the columns
print(preview)

Original Data:
  competitorname  chocolate  fruity  caramel  peanutyalmondy  nougat  \
0      100 Grand          1       0        1               0       0   
1   3 Musketeers          1       0        0               0       1   
2       One dime          0       0        0               0       0   
3    One quarter          0       0        0               0       0   
4      Air Heads          0       1        0               0       0   

   crispedricewafer  hard  bar  pluribus  sugarpercent  pricepercent  \
0                 1     0    1         0         0.732         0.860   
1                 0     0    1         0         0.604         0.511   
2                 0     0    0         0         0.011         0.116   
3                 0     0    0         0         0.011         0.511   
4                 0     0    0         0         0.906         0.511   

   winpercent  
0   66.971725  
1   67.602936  
2   32.261086  
3   46.116505  
4   52.341465  


In [None]:
# Step 2: Report missing values per column, then discard incomplete rows
print("\nChecking for missing values:")
missing_per_column = df.isnull().sum()  # number of null entries in each column
print(missing_per_column)
# Keep only fully populated rows (alternative: df.fillna(value) imputes instead of dropping)
df = df.dropna()


Checking for missing values:
competitorname      0
chocolate           0
fruity              0
caramel             0
peanutyalmondy      0
nougat              0
crispedricewafer    0
hard                0
bar                 0
pluribus            0
sugarpercent        0
pricepercent        0
winpercent          0
dtype: int64


In [None]:
# Step 3: Report fully duplicated rows, then remove them
print("\nChecking for duplicate:")
duplicate_count = df.duplicated().sum()  # rows that repeat an earlier row exactly
print(duplicate_count)
df = df.drop_duplicates()  # keep a single copy of each distinct row


Checking for duplicate:
0


In [None]:
# Step 4: Encoding Categorical Variables
# Every object-dtype (string) column is replaced in place by integer codes from
# its own LabelEncoder. The fitted encoders are kept in `label_encoders`, keyed
# by column name, so the codes can later be mapped back with inverse_transform.
# NOTE(review): in the data shown above the only object column is
# `competitorname`, which looks like a per-row identifier rather than a
# feature - confirm it should be encoded (and later used as a feature) at all.
print("\nEncoding categorical variables:")
categorical_cols_ = df.select_dtypes(include=['object']).columns # Selecting categorical columns
label_encoders = {}
for col in categorical_cols_:
  le = LabelEncoder()
  df[col] = le.fit_transform(df[col]) # Apply Label Encoding
  label_encoders[col] = le
# Show the result once, after all columns are encoded. Printing inside the loop
# would dump the frame once per categorical column and never when there are none.
print(df.head()) # Display transformed dataset


Encoding categorical variables:
   competitorname  chocolate  fruity  caramel  peanutyalmondy  nougat  \
0               0          1       0        1               0       0   
1               1          1       0        0               0       1   
2              44          0       0        0               0       0   
3              45          0       0        0               0       0   
4               2          0       1        0               0       0   

   crispedricewafer  hard  bar  pluribus  sugarpercent  pricepercent  \
0                 1     0    1         0         0.732         0.860   
1                 0     0    1         0         0.604         0.511   
2                 0     0    0         0         0.011         0.116   
3                 0     0    0         0         0.011         0.511   
4                 0     0    0         0         0.906         0.511   

   winpercent  
0   66.971725  
1   67.602936  
2   32.261086  
3   46.116505  
4   52.341465  

In [None]:
# Step 5: Feature Scaling
# Standardizes the numeric feature columns to zero mean and unit variance.
# The 'chocolate' column is deliberately left untouched: Step 6 uses it as the
# target, and standardizing a 0/1 label turns it into continuous floats that
# classifiers cannot treat as class labels.
# NOTE(review): the scaler is fit on the full dataset before the train/test
# split in Step 6, so test-set statistics leak into the scaling. Consider
# fitting on the training rows only and calling transform() on the test rows.
print("\nApplying feature scalling:")
numeric_cols = df.select_dtypes(include=[np.number]).columns # Selecting numeric columns
# errors='ignore' keeps this cell working even if 'chocolate' is absent or non-numeric
feature_cols = numeric_cols.drop('chocolate', errors='ignore')
scaler = StandardScaler() # Initializing standard scaler
df[feature_cols] = scaler.fit_transform(df[feature_cols]) # standardizing feature columns only
print(df.head()) # Display scaled dataset


Applying feature scalling:
   competitorname  chocolate    fruity   caramel  peanutyalmondy    nougat  \
0       -1.711792   1.138990 -0.899172  2.251983       -0.444053 -0.299572   
1       -1.671035   1.138990 -0.899172 -0.444053       -0.444053  3.338092   
2        0.081514  -0.877971 -0.899172 -0.444053       -0.444053 -0.299572   
3        0.122271  -0.877971 -0.899172 -0.444053       -0.444053 -0.299572   
4       -1.630278  -0.877971  1.112134 -0.444053       -0.444053 -0.299572   

   crispedricewafer     hard       bar  pluribus  sugarpercent  pricepercent  \
0          3.338092 -0.46291  1.745743  -1.03594      0.901261      1.376914   
1         -0.299572 -0.46291  1.745743  -1.03594      0.445922      0.148273   
2         -0.299572 -0.46291 -0.572822  -1.03594     -1.663576     -1.242308   
3         -0.299572 -0.46291 -0.572822  -1.03594     -1.663576      0.148273   
4         -0.299572 -0.46291 -0.572822  -1.03594      1.520236      0.148273   

   winpercent  
0    1

In [None]:
# Step 6: Split the dataset into training and testing sets (80% / 20%)
print("\nSplitting dataset into training and testing sts:")
y = df['chocolate']  # 'chocolate' is assumed to be the target variable
x = df.drop(columns=['chocolate'])  # every remaining column is used as a feature
# random_state is fixed so the same rows land in each partition on every run
splits = train_test_split(x, y, test_size=0.2, random_state=42)
x_train, x_test, y_train, y_test = splits
print(f"Training set size:{x_train.shape}, Testing set size:{x_test.shape}")


Splitting dataset into training and testing sts:
Training set size:(68, 12), Testing set size:(17, 12)
