### DATA CLEANING/PREPROCESSING

In [1]:
import numpy as np
import pandas as pd

NULL VALUES -
1. ROW REMOVE
2. COLUMN KA AVG
3. MEDIAN
4. MODE

#### calculating mean, median, mode

In [2]:
listt = [1, 2, 3, 4, 5, 5, 5]
df = pd.DataFrame(listt, columns=["values"])

In [3]:
df.head()

Unnamed: 0,values
0,1
1,2
2,3
3,4
4,5


In [4]:
# Mean
mean_value = df['values'].mean()

# Mean
median_value = df['values'].median()

# Mean
mode_value = df['values'].mode()[0]

print(f"""Stattistics:
    mean = {mean_value}
    median = {median_value}
    mode = {mode_value}
""")

Stattistics:
    mean = 3.5714285714285716
    median = 4.0
    mode = 5



## Preprocess.csv

In [5]:
data = pd.read_csv("preprocess.csv")
data.head()

Unnamed: 0,Student Name,Gender,Age,City,Course,Marks,Placement Status
0,Amit,Male,21.0,Mumbai,AI,85.0,Yes
1,Riya,Female,22.0,Pune,ML,90.0,Yes
2,Suresh,Male,,Delhi,AI,78.0,No
3,Neha,Female,21.0,,DS,,Yes
4,Rahul,,23.0,Mumbai,ML,88.0,No


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Student Name      9 non-null      object 
 1   Gender            8 non-null      object 
 2   Age               8 non-null      float64
 3   City              8 non-null      object 
 4   Course            9 non-null      object 
 5   Marks             8 non-null      float64
 6   Placement Status  9 non-null      object 
dtypes: float64(2), object(5)
memory usage: 692.0+ bytes


### Column names cleaninng 

In [76]:
data.columns

Index(['Student Name', 'Gender', 'Age', 'City', 'Course', 'Marks',
       'Placement Status'],
      dtype='object')

In [77]:
# Change column names -- no space should be in column
data.rename(columns={
    "Student Name" : "Student_name",
    "Placement Status" : "Placement_status"
}, inplace=True)

# OR 
# data = data.rename(columns={
#     "Student Name" : "Student_name",
#     "Placement Status" : "Placement_status"
# })

In [78]:
data.columns

Index(['Student_name', 'Gender', 'Age', 'City', 'Course', 'Marks',
       'Placement_status'],
      dtype='object')

In [79]:
data.columns = data.columns.str.lower()

In [80]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   student_name      9 non-null      object 
 1   gender            8 non-null      object 
 2   age               8 non-null      float64
 3   city              8 non-null      object 
 4   course            9 non-null      object 
 5   marks             8 non-null      float64
 6   placement_status  9 non-null      object 
dtypes: float64(2), object(5)
memory usage: 692.0+ bytes


In [7]:
## checking null values
data.isnull().sum()

Student Name        1
Gender              2
Age                 2
City                2
Course              1
Marks               2
Placement Status    1
dtype: int64

### student_name NULL CLEANING

In [81]:
data["student_name"]    
# in this there are some NaN -- to which we can't do like mean, median so remove that row

0      Amit
1      Riya
2    Suresh
3      Neha
4     Rahul
5     Pooja
6       NaN
7     Karan
8     Anita
9     Vikas
Name: student_name, dtype: object

In [82]:
data.dropna(subset=["student_name"], inplace=True)   
# in the column student_name where ever NaN is there , remove that row
data

Unnamed: 0,student_name,gender,age,city,course,marks,placement_status
0,Amit,Male,21.0,Mumbai,AI,85.0,Yes
1,Riya,Female,22.0,Pune,ML,90.0,Yes
2,Suresh,Male,,Delhi,AI,78.0,No
3,Neha,Female,21.0,,DS,,Yes
4,Rahul,,23.0,Mumbai,ML,88.0,No
5,Pooja,Female,22.0,Pune,,92.0,Yes
7,Karan,Male,,Mumbai,AI,75.0,No
8,Anita,Female,21.0,,ML,,Yes
9,Vikas,,23.0,Pune,DS,89.0,Yes


### age NULL CLEANING

In [83]:
age_mean = data["age"].mean().round()
print(age_mean)

22.0


In [84]:
data["age"] = data["age"].fillna(age_mean)

In [85]:
data

Unnamed: 0,student_name,gender,age,city,course,marks,placement_status
0,Amit,Male,21.0,Mumbai,AI,85.0,Yes
1,Riya,Female,22.0,Pune,ML,90.0,Yes
2,Suresh,Male,22.0,Delhi,AI,78.0,No
3,Neha,Female,21.0,,DS,,Yes
4,Rahul,,23.0,Mumbai,ML,88.0,No
5,Pooja,Female,22.0,Pune,,92.0,Yes
7,Karan,Male,22.0,Mumbai,AI,75.0,No
8,Anita,Female,21.0,,ML,,Yes
9,Vikas,,23.0,Pune,DS,89.0,Yes


### marks NULL CLEANING

In [86]:
marks_mean = data["marks"].mean().round()
print(marks_mean)

85.0


In [87]:
data["marks"] = data["marks"].fillna(marks_mean)

In [88]:
data

Unnamed: 0,student_name,gender,age,city,course,marks,placement_status
0,Amit,Male,21.0,Mumbai,AI,85.0,Yes
1,Riya,Female,22.0,Pune,ML,90.0,Yes
2,Suresh,Male,22.0,Delhi,AI,78.0,No
3,Neha,Female,21.0,,DS,85.0,Yes
4,Rahul,,23.0,Mumbai,ML,88.0,No
5,Pooja,Female,22.0,Pune,,92.0,Yes
7,Karan,Male,22.0,Mumbai,AI,75.0,No
8,Anita,Female,21.0,,ML,85.0,Yes
9,Vikas,,23.0,Pune,DS,89.0,Yes


## Categorical to integer data - Encoding

Computer does'nt understand categorical data so we need to convert them into integers
Types of Encoder
1. label Encoder
2. Oone hot Encoding
3. dummy variable

In [89]:
from sklearn.preprocessing import LabelEncoder

# create LabelEncoder object
label = LabelEncoder()

In [90]:
data

Unnamed: 0,student_name,gender,age,city,course,marks,placement_status
0,Amit,Male,21.0,Mumbai,AI,85.0,Yes
1,Riya,Female,22.0,Pune,ML,90.0,Yes
2,Suresh,Male,22.0,Delhi,AI,78.0,No
3,Neha,Female,21.0,,DS,85.0,Yes
4,Rahul,,23.0,Mumbai,ML,88.0,No
5,Pooja,Female,22.0,Pune,,92.0,Yes
7,Karan,Male,22.0,Mumbai,AI,75.0,No
8,Anita,Female,21.0,,ML,85.0,Yes
9,Vikas,,23.0,Pune,DS,89.0,Yes


In [91]:
# apply LableEncoder to placement_status -- Yes = 1, No = 0  -- automatic
data["placement_status"] = label.fit_transform(data["placement_status"])

# apply LableEncoder to gender -- Male = 1, Female = 0
data["gender"] = label.fit_transform(data["gender"])

# apply LableEncoder to city -- Mumbai = 1, Pune = 2, Delhi = 0, Nan = 3
data["city"] = label.fit_transform(data["city"])

# apply LableEncoder to course -- AI = 0, DS = 1, ML = 2
data["course"] = label.fit_transform(data["course"])


data

Unnamed: 0,student_name,gender,age,city,course,marks,placement_status
0,Amit,1,21.0,1,0,85.0,1
1,Riya,0,22.0,2,2,90.0,1
2,Suresh,1,22.0,0,0,78.0,0
3,Neha,0,21.0,3,1,85.0,1
4,Rahul,2,23.0,1,2,88.0,0
5,Pooja,0,22.0,2,3,92.0,1
7,Karan,1,22.0,1,0,75.0,0
8,Anita,0,21.0,3,2,85.0,1
9,Vikas,2,23.0,2,1,89.0,1
