<a href="https://colab.research.google.com/github/Sadikshya-dhakal/AI/blob/main/DataSciencelab1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Data Preprocessing: Cleaning and integrating datasets from multiple sources.**

In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from scipy import stats

In [None]:
# Creating a manual dataset.
# The table contains a missing name, missing ages and purchase amounts, and
# CustomerId 105 appears twice. Missing numeric entries are written as NaN;
# the missing name stays None so the column remains text.
data = pd.DataFrame(dict(
    CustomerId=[101, 105, 103, 104, 108, 102, 105],
    name=['ram', 'hari', 'rita', 'gita', None, 'krishna', 'sita'],
    age=[12, 24, np.nan, 25, 17, np.nan, 18],
    purchase_amount=[100.5, np.nan, 85.3, 100.5, 50.0, 150.0, np.nan],
    date_of_purchase=[
        '2025/11/20', '2025/11/22', '2025/11/21', '2025/11/20',
        '2025/12/25', '2025/11/18', '2025/11/26',
    ],
))
data

Unnamed: 0,CustomerId,name,age,purchase_amount,date_of_purchase
0,101,ram,12.0,100.5,2025/11/20
1,105,hari,24.0,,2025/11/22
2,103,rita,,85.3,2025/11/21
3,104,gita,25.0,100.5,2025/11/20
4,108,,17.0,50.0,2025/12/25
5,102,krishna,,150.0,2025/11/18
6,105,sita,18.0,,2025/11/26


# Data Cleaning
Data cleaning is the process of preparing raw data by handling missing values, correcting errors, and removing duplicates. It ensures consistency, accuracy, and reliability of datasets. Clean data improves analysis quality and enhances the performance of machine learning models.

In [None]:
# Handling missing values.
# Learn the column means of 'age' and 'purchase_amount' first, then write the
# imputed values back so every NaN in those two columns is replaced by the
# mean of its column. The 'name' column is not touched here.
imputer = SimpleImputer(strategy='mean')
imputer.fit(data[['age', 'purchase_amount']])
data[['age', 'purchase_amount']] = imputer.transform(data[['age', 'purchase_amount']])
data

Unnamed: 0,CustomerId,name,age,purchase_amount,date_of_purchase
0,101,ram,12.0,100.5,2025/11/20
1,105,hari,24.0,97.26,2025/11/22
2,103,rita,19.2,85.3,2025/11/21
3,104,gita,25.0,100.5,2025/11/20
4,108,,17.0,50.0,2025/12/25
5,102,krishna,19.2,150.0,2025/11/18
6,105,sita,18.0,97.26,2025/11/26


In [None]:
# Removing duplicate data.
# With no `subset`, rows count as duplicates only when every column matches,
# and the first occurrence is kept. The two rows sharing CustomerId 105
# differ in name and date, so both of them remain.
data = data.drop_duplicates(keep='first')
data

Unnamed: 0,CustomerId,name,age,purchase_amount,date_of_purchase
0,101,ram,12.0,100.5,2025/11/20
1,105,hari,24.0,97.26,2025/11/22
2,103,rita,19.2,85.3,2025/11/21
3,104,gita,25.0,100.5,2025/11/20
4,108,,17.0,50.0,2025/12/25
5,102,krishna,19.2,150.0,2025/11/18
6,105,sita,18.0,97.26,2025/11/26


In [None]:
# Correcting inconsistent format: parse the date strings into real datetimes.
#
# The column is replaced through `assign` instead of
# `data.loc[:, 'date_of_purchase'] = ...`. A full-column `.loc` assignment
# writes into the existing object-dtype column, so the cells become
# Timestamps but the dtype stays `object` (shown as '2025-11-20 00:00:00')
# and the `.dt` accessor is unavailable. Replacing the column gives a proper
# datetime64 column. `assign` also returns a new frame, so nothing is written
# into the frame produced by `drop_duplicates`.
#
# errors='coerce' turns any value that cannot be parsed into NaT.
data = data.assign(
    date_of_purchase=pd.to_datetime(data['date_of_purchase'], errors='coerce')
)
data

Unnamed: 0,CustomerId,name,age,purchase_amount,date_of_purchase
0,101,ram,12.0,100.5,2025-11-20 00:00:00
1,105,hari,24.0,97.26,2025-11-22 00:00:00
2,103,rita,19.2,85.3,2025-11-21 00:00:00
3,104,gita,25.0,100.5,2025-11-20 00:00:00
4,108,,17.0,50.0,2025-12-25 00:00:00
5,102,krishna,19.2,150.0,2025-11-18 00:00:00
6,105,sita,18.0,97.26,2025-11-26 00:00:00


# Data Integration
Data integration in data science is the process of combining data from multiple sources into a single, unified view so it can be analyzed consistently and effectively. It ensures that information stored in different formats or locations becomes coherent, accessible, and useful for decision-making.


In [12]:
# Load the second data source. The path is relative, so 'data_2.csv' has to
# be available in the notebook's working directory.
data1 = pd.read_csv(filepath_or_buffer='data_2.csv')
data1

Unnamed: 0,CustomerId,Gender,Daily_Income
0,101,Male,800
1,105,Male,750
2,103,Female,850
3,104,Male,650
4,108,,700
5,102,Female,-5000


In [13]:
# Identifying outliers with the IQR rule.
# A Daily_Income value is flagged when it lies more than 1.5 * IQR below the
# first quartile or more than 1.5 * IQR above the third quartile.
Q1, Q3 = data1['Daily_Income'].quantile([0.25, 0.75])
IQR = Q3 - Q1
outliers = (
    data1['Daily_Income'].lt(Q1 - 1.5 * IQR)
    | data1['Daily_Income'].gt(Q3 + 1.5 * IQR)
)
outliers

Unnamed: 0,Daily_Income
0,False
1,False
2,False
3,False
4,False
5,True


In [14]:
# Identifying outliers with z-scores.
# A value is flagged when its absolute z-score exceeds 2. This rebinds
# `outliers`, so the mask used afterwards is the z-score one.
z_scores = stats.zscore(data1['Daily_Income'])
outliers = np.abs(z_scores) > 2
outliers


array([False, False, False, False, False,  True])

In [15]:
# Removing outliers.
# Keep only the rows whose `outliers` flag is False, then renumber the index
# from 0 so the result has no gaps.
data_no_outliers = data1.loc[~outliers].reset_index(drop=True)
data_no_outliers

Unnamed: 0,CustomerId,Gender,Daily_Income
0,101,Male,800
1,105,Male,750
2,103,Female,850
3,104,Male,650
4,108,,700


In [16]:
# Merging the two datasets on their common key 'CustomerId'.
# The inner join keeps only customers that appear in both tables. A
# CustomerId that occurs more than once in `data` produces one merged row
# per occurrence.
merged_data = data.merge(data_no_outliers, how='inner', on='CustomerId')
merged_data

Unnamed: 0,CustomerId,name,age,purchase_amount,date_of_purchase,Gender,Daily_Income
0,101,ram,12.0,100.5,2025-11-20 00:00:00,Male,800
1,105,hari,24.0,97.26,2025-11-22 00:00:00,Male,750
2,103,rita,19.2,85.3,2025-11-21 00:00:00,Female,850
3,104,gita,25.0,100.5,2025-11-20 00:00:00,Male,650
4,108,,17.0,50.0,2025-12-25 00:00:00,,700
5,105,sita,18.0,97.26,2025-11-26 00:00:00,Male,750


# Data Transformation
Data transformation in data science is the process of converting raw data into a suitable format for analysis. It involves cleaning, normalizing, aggregating, or encoding data so that it becomes consistent and usable. Transformation ensures that diverse datasets can be integrated, compared, and effectively used in machine learning models.

In [17]:
# Scaling numeric data.
# Fit a StandardScaler on 'purchase_amount' and store the standardised values
# in a new column, leaving the original amounts in place.
scaler = StandardScaler()
scaler.fit(merged_data[['purchase_amount']])
merged_data['standard_purchase_amt'] = scaler.transform(merged_data[['purchase_amount']])
merged_data


Unnamed: 0,CustomerId,name,age,purchase_amount,date_of_purchase,Gender,Daily_Income,standard_purchase_amt
0,101,ram,12.0,100.5,2025-11-20 00:00:00,Male,800,0.670066
1,105,hari,24.0,97.26,2025-11-22 00:00:00,Male,750,0.489599
2,103,rita,19.2,85.3,2025-11-21 00:00:00,Female,850,-0.176568
3,104,gita,25.0,100.5,2025-11-20 00:00:00,Male,650,0.670066
4,108,,17.0,50.0,2025-12-25 00:00:00,,700,-2.142763
5,105,sita,18.0,97.26,2025-11-26 00:00:00,Male,750,0.489599


In [18]:
# Encoding categorical variables using one-hot encoding.
# Each distinct 'Gender' value becomes its own 0/1 indicator column. A
# missing Gender is treated as a category of its own and shows up as
# 'Gender_nan'.
encoder = OneHotEncoder(sparse_output=False)
encoded_data = pd.DataFrame(
    encoder.fit_transform(merged_data[['Gender']]),
    columns=encoder.get_feature_names_out(['Gender']),
    # Reuse the merged frame's index: concat(axis=1) aligns on index labels,
    # so a default 0..n-1 index here would misalign rows (and introduce NaNs)
    # whenever merged_data is not indexed 0..n-1.
    index=merged_data.index,
)
# NOTE(review): the chained assignment also rebinds `data` to the combined
# frame. It is kept so that anything relying on that name keeps working, but
# it makes re-running the earlier cleaning/merge cells unsafe without a
# restart. Consider dropping the `data =` part.
data_all = data = pd.concat([merged_data, encoded_data], axis=1)
data_all

Unnamed: 0,CustomerId,name,age,purchase_amount,date_of_purchase,Gender,Daily_Income,standard_purchase_amt,Gender_Female,Gender_Male,Gender_nan
0,101,ram,12.0,100.5,2025-11-20 00:00:00,Male,800,0.670066,0.0,1.0,0.0
1,105,hari,24.0,97.26,2025-11-22 00:00:00,Male,750,0.489599,0.0,1.0,0.0
2,103,rita,19.2,85.3,2025-11-21 00:00:00,Female,850,-0.176568,1.0,0.0,0.0
3,104,gita,25.0,100.5,2025-11-20 00:00:00,Male,650,0.670066,0.0,1.0,0.0
4,108,,17.0,50.0,2025-12-25 00:00:00,,700,-2.142763,0.0,0.0,1.0
5,105,sita,18.0,97.26,2025-11-26 00:00:00,Male,750,0.489599,0.0,1.0,0.0
