In [1]:
# import the pandas library
import pandas as pd
import numpy as np
#Creating a DataFrame with Missing Values
df = pd.DataFrame(np.random.randn(5, 3), index=['a', 'c', 'e', 'f','h'], columns=['C01', 'C02', 'C03'])
df = df.reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])
print("\n Reindexed Data Values")
print("-------------------------")
print(df)
#Method 1 - Filling Every Missing Values with 0
print("\n\n Every Missing Value Replaced with '0':")
print("--------------------------------------------")
print(df.fillna(0))
#Method 2 - Dropping Rows Having Missing Values
print("\n\n Dropping Rows with Missing Values:")
print("----------------------------------------")
print(df.dropna())
#Method 3 - Replacing missing values with the Median
Valuemedian = df['C01'].median()
df['C01'].fillna(Valuemedian, inplace=True)
print("\n\n Missing Values for Column 1 Replaced with Median Value:")
print("--------------------------------------------------")
print(df)


 Reindexed Data Values
-------------------------
        C01       C02       C03
a  0.022542  0.511028 -0.420655
b       NaN       NaN       NaN
c  0.169507 -1.345512 -0.843278
d       NaN       NaN       NaN
e -0.196764  0.462977  0.253172
f  0.552671  0.175541 -0.044932
g       NaN       NaN       NaN
h -1.256212  1.609742  0.747193


 Every Missing Value Replaced with '0':
--------------------------------------------
        C01       C02       C03
a  0.022542  0.511028 -0.420655
b  0.000000  0.000000  0.000000
c  0.169507 -1.345512 -0.843278
d  0.000000  0.000000  0.000000
e -0.196764  0.462977  0.253172
f  0.552671  0.175541 -0.044932
g  0.000000  0.000000  0.000000
h -1.256212  1.609742  0.747193


 Dropping Rows with Missing Values:
----------------------------------------
        C01       C02       C03
a  0.022542  0.511028 -0.420655
c  0.169507 -1.345512 -0.843278
e -0.196764  0.462977  0.253172
f  0.552671  0.175541 -0.044932
h -1.256212  1.609742  0.747193


 Missing Value

In [2]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import scipy.stats as s
#Creating a DataFrame
d = {'C01':[1,3,7,4],'C02':[12,2,7,1],'C03':[22,34,-11,9]}
df2 = pd.DataFrame(d)
print("\n ORIGINAL DATA VALUES")
print("------------------------")
print(df2)


 ORIGINAL DATA VALUES
------------------------
   C01  C02  C03
0    1   12   22
1    3    2   34
2    7    7  -11
3    4    1    9


In [3]:
#Method 1: Rescaling Data
print("\n\n Data Scaled Between 0 to 1")
data_scaler = preprocessing.MinMaxScaler(feature_range = (0, 1))
data_scaled = data_scaler.fit_transform(df2)
print("\n Min Max Scaled Data ")
print("-----------------------")
print(data_scaled.round(2))



 Data Scaled Between 0 to 1

 Min Max Scaled Data 
-----------------------
[[0.   1.   0.73]
 [0.33 0.09 1.  ]
 [1.   0.55 0.  ]
 [0.5  0.   0.44]]


In [4]:
#Method 2: Normalization rescales such that sum of each row is 1.
dn = preprocessing.normalize(df2, norm = 'l1')
print("\n L1 Normalized Data ")
print(" ----------------------")
print(dn.round(2))


 L1 Normalized Data 
 ----------------------
[[ 0.03  0.34  0.63]
 [ 0.08  0.05  0.87]
 [ 0.28  0.28 -0.44]
 [ 0.29  0.07  0.64]]


In [5]:
#Method 3: Binarize Data (Make Binary)
data_binarized = preprocessing.Binarizer(threshold=5).transform(df2)
print("\n Binarized data ")
print(" -----------------")
print(data_binarized)


 Binarized data 
 -----------------
[[0 1 1]
 [0 0 1]
 [1 1 0]
 [0 0 1]]


In [6]:
#Method 4: Standardizing Data
print("\n Standardizing Data ")
print("----------------------")
X_train = np.array([[ 1., -1., 2.],[ 2., 0., 0.],[ 0., 1., -1.]])
print(" Orginal Data \n", X_train)
print("\n Initial Mean : ", s.tmean(X_train).round(2))
print(" Initial Standard Deviation : ",round(X_train.std(),2))
X_scaled = preprocessing.scale(X_train)
X_scaled.mean(axis=0)
X_scaled.std(axis=0)
print("\n Standardized Data \n", X_scaled.round(2))
print("\n Scaled Mean : ",s.tmean(X_scaled).round(2))
print(" Scaled Standard Deviation : ",round(X_scaled.std(),2))


 Standardizing Data 
----------------------
 Orginal Data 
 [[ 1. -1.  2.]
 [ 2.  0.  0.]
 [ 0.  1. -1.]]

 Initial Mean :  0.44
 Initial Standard Deviation :  1.07

 Standardized Data 
 [[ 0.   -1.22  1.34]
 [ 1.22  0.   -0.27]
 [-1.22  1.22 -1.07]]

 Scaled Mean :  0.0
 Scaled Standard Deviation :  1.0


In [7]:
#Program for Equal Width Binning
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
#Create a Dataframe
d={'item':['Shirt','Sweater','BodyWarmer','Baby_Napkin'],
'price':[ 1250,1160,2842,1661]}
#print the Dataframe
df = pd.DataFrame(d)
print("\n ORIGINAL DATASET")
print(" ----------------")
print(df)
#Creating bins
m1=min(df["price"])
m2=max(df["price"])
bins=np.linspace(m1,m2,4)
names=["low", "medium", "high"]
df["price_bin"]=pd.cut(df["price"],bins,labels=names,include_lowest=True)
print("\n BINNED DATASET")
print(" ----------------")
print(df)


 ORIGINAL DATASET
 ----------------
          item  price
0        Shirt   1250
1      Sweater   1160
2   BodyWarmer   2842
3  Baby_Napkin   1661

 BINNED DATASET
 ----------------
          item  price price_bin
0        Shirt   1250       low
1      Sweater   1160       low
2   BodyWarmer   2842      high
3  Baby_Napkin   1661       low


In [8]:
import pandas as pd
url = 'https://github.com/suneet10/DataPreprocessing/blob/main/Data.csv?raw=true';
df = pd.read_csv(url, index_col=0)
df

Unnamed: 0_level_0,Age,Salary,Purchased
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
France,44.0,72000.0,No
Spain,27.0,48000.0,Yes
Germany,30.0,54000.0,No
Spain,38.0,61000.0,No
Germany,40.0,,Yes
France,35.0,58000.0,Yes
Spain,,52000.0,No
France,48.0,79000.0,Yes
Germany,50.0,83000.0,No
France,37.0,67000.0,Yes


In [9]:
df.describe()

Unnamed: 0,Age,Salary
count,9.0,9.0
mean,38.777778,63777.777778
std,7.693793,12265.579662
min,27.0,48000.0
25%,35.0,54000.0
50%,38.0,61000.0
75%,44.0,72000.0
max,50.0,83000.0


In [10]:
df.shape

(10, 3)

In [11]:
import pandas as pd
url = 'https://github.com/suneet10/DataPreprocessing/blob/main/Data.csv?raw=true';
df = pd.read_csv(url, index_col=0)
#c) Display first 3 rows from dataset
df.head(3)

Unnamed: 0_level_0,Age,Salary,Purchased
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
France,44.0,72000.0,No
Spain,27.0,48000.0,Yes
Germany,30.0,54000.0,No


In [12]:
#2. Handling Missing Value: a) Replace missing value of salary,age column with mean of that column.
ValuemeanAge = df['Age'].mean()
df['Age'].fillna(ValuemeanAge, inplace=True)
df

Unnamed: 0_level_0,Age,Salary,Purchased
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
France,44.0,72000.0,No
Spain,27.0,48000.0,Yes
Germany,30.0,54000.0,No
Spain,38.0,61000.0,No
Germany,40.0,,Yes
France,35.0,58000.0,Yes
Spain,38.777778,52000.0,No
France,48.0,79000.0,Yes
Germany,50.0,83000.0,No
France,37.0,67000.0,Yes


In [13]:
#2. Handling Missing Value: a) Replace missing value of salary,age column with mean of that column.
ValuemeanAge = df['Salary'].mean()
df['Salary'].fillna(ValuemeanAge, inplace=True)
df

Unnamed: 0_level_0,Age,Salary,Purchased
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
France,44.0,72000.0,No
Spain,27.0,48000.0,Yes
Germany,30.0,54000.0,No
Spain,38.0,61000.0,No
Germany,40.0,63777.777778,Yes
France,35.0,58000.0,Yes
Spain,38.777778,52000.0,No
France,48.0,79000.0,Yes
Germany,50.0,83000.0,No
France,37.0,67000.0,Yes


In [26]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Load the dataset from the provided URL
url = "https://github.com/suneet10/DataPreprocessing/raw/main/Data.csv"
df = pd.read_csv(url)

# Display the original dataset
print("Original Dataset:")
print(df)

# Task a: Apply OneHot encoding on the 'Country' column
onehot_encoder = OneHotEncoder(sparse=False, drop='first')  # 'drop' parameter removes one of the columns to avoid multicollinearity
country_encoded = onehot_encoder.fit_transform(df[['Country']])
country_encoded_df = pd.DataFrame(country_encoded, columns=onehot_encoder.get_feature_names(['Country']))
df = pd.concat([df, country_encoded_df], axis=1)
df.drop(['Country'], axis=1, inplace=True)

# Display the dataset after OneHot encoding
print("\nDataset after OneHot encoding on 'Country' column:")
print(df)

Original Dataset:
   Country   Age   Salary Purchased
0   France  44.0  72000.0        No
1    Spain  27.0  48000.0       Yes
2  Germany  30.0  54000.0        No
3    Spain  38.0  61000.0        No
4  Germany  40.0      NaN       Yes
5   France  35.0  58000.0       Yes
6    Spain   NaN  52000.0        No
7   France  48.0  79000.0       Yes
8  Germany  50.0  83000.0        No
9   France  37.0  67000.0       Yes

Dataset after OneHot encoding on 'Country' column:
    Age   Salary Purchased  Country_Germany  Country_Spain
0  44.0  72000.0        No              0.0            0.0
1  27.0  48000.0       Yes              0.0            1.0
2  30.0  54000.0        No              1.0            0.0
3  38.0  61000.0        No              0.0            1.0
4  40.0      NaN       Yes              1.0            0.0
5  35.0  58000.0       Yes              0.0            0.0
6   NaN  52000.0        No              0.0            1.0
7  48.0  79000.0       Yes              0.0            0.0
8  

In [27]:
# Task b: Apply Label encoding on the 'Purchased' column
label_encoder = LabelEncoder()
df['Purchased'] = label_encoder.fit_transform(df['Purchased'])

# Display the final dataset after Label encoding
print("\nFinal Dataset after Label encoding on 'Purchased' column:")
print(df)


Final Dataset after Label encoding on 'Purchased' column:
    Age   Salary  Purchased  Country_Germany  Country_Spain
0  44.0  72000.0          0              0.0            0.0
1  27.0  48000.0          1              0.0            1.0
2  30.0  54000.0          0              1.0            0.0
3  38.0  61000.0          0              0.0            1.0
4  40.0      NaN          1              1.0            0.0
5  35.0  58000.0          1              0.0            0.0
6   NaN  52000.0          0              0.0            1.0
7  48.0  79000.0          1              0.0            0.0
8  50.0  83000.0          0              1.0            0.0
9  37.0  67000.0          1              0.0            0.0


In [16]:
import pandas as pd
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv?raw=true";
df = pd.read_csv(url,sep = ";")
df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


In [17]:
from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler()
df = min_max_scaler.fit_transform(df)
#df = df.apply(lambda x: 0 if x.strip()=='N' else 1)
df

array([[0.24778761, 0.39726027, 0.        , ..., 0.13772455, 0.15384615,
        0.4       ],
       [0.28318584, 0.52054795, 0.        , ..., 0.20958084, 0.21538462,
        0.4       ],
       [0.28318584, 0.43835616, 0.04      , ..., 0.19161677, 0.21538462,
        0.4       ],
       ...,
       [0.15044248, 0.26712329, 0.13      , ..., 0.25149701, 0.4       ,
        0.6       ],
       [0.11504425, 0.35958904, 0.12      , ..., 0.22754491, 0.27692308,
        0.4       ],
       [0.12389381, 0.13013699, 0.47      , ..., 0.19760479, 0.4       ,
        0.6       ]])

In [18]:
#2. Rescaling: Normalised the dataset using MinMaxScaler class
import pandas, scipy, numpy
from sklearn.preprocessing import MinMaxScaler
df=pandas.read_csv( 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv ',sep=';')
array=df.values
#Separating data into input and output components
x=array[:,0:8]
y=array[:,8]
scaler=MinMaxScaler(feature_range=(0,1))
rescaledX=scaler.fit_transform(x)
numpy.set_printoptions(precision=3) #Setting precision for the output
rescaledX[0:5,:]

array([[0.248, 0.397, 0.   , 0.068, 0.107, 0.141, 0.099, 0.568],
       [0.283, 0.521, 0.   , 0.116, 0.144, 0.338, 0.216, 0.494],
       [0.283, 0.438, 0.04 , 0.096, 0.134, 0.197, 0.17 , 0.509],
       [0.584, 0.11 , 0.56 , 0.068, 0.105, 0.225, 0.191, 0.582],
       [0.248, 0.397, 0.   , 0.068, 0.107, 0.141, 0.099, 0.568]])

In [19]:
#3. Standardizing Data (transform them into a standard Gaussian distribution with a mean of 0 and a standard deviation of 1)
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler().fit(x)
rescaledX=scaler.transform(x)
rescaledX[0:5,:]

array([[-0.528,  0.962, -1.391, -0.453, -0.244, -0.466, -0.379,  0.558],
       [-0.299,  1.967, -1.391,  0.043,  0.224,  0.873,  0.624,  0.028],
       [-0.299,  1.297, -1.186, -0.169,  0.096, -0.084,  0.229,  0.134],
       [ 1.655, -1.384,  1.484, -0.453, -0.265,  0.108,  0.412,  0.664],
       [-0.528,  0.962, -1.391, -0.453, -0.244, -0.466, -0.379,  0.558]])

In [20]:
# 4. Normalizing Data ( rescale each observation to a length of 1 (a unit norm). For this, use the Normalizer class.)
from sklearn.preprocessing import Normalizer
scaler=Normalizer().fit(x)
normalizedX=scaler.transform(x)
normalizedX[0:5,:]

array([[2.024e-01, 1.914e-02, 0.000e+00, 5.196e-02, 2.079e-03, 3.008e-01,
        9.299e-01, 2.729e-02],
       [1.083e-01, 1.222e-02, 0.000e+00, 3.611e-02, 1.361e-03, 3.472e-01,
        9.306e-01, 1.385e-02],
       [1.377e-01, 1.342e-02, 7.061e-04, 4.060e-02, 1.624e-03, 2.648e-01,
        9.533e-01, 1.760e-02],
       [1.767e-01, 4.416e-03, 8.833e-03, 2.997e-02, 1.183e-03, 2.681e-01,
        9.464e-01, 1.574e-02],
       [2.024e-01, 1.914e-02, 0.000e+00, 5.196e-02, 2.079e-03, 3.008e-01,
        9.299e-01, 2.729e-02]])

In [21]:
#5. Binarizing Data using we use the Binarizer class (Using a binary threshold, it is possible to transform our data by marking the values above it 1 and those equal to or below it, 0)
from sklearn.preprocessing import Binarizer
binarizer=Binarizer(threshold=0.0).fit(x)
binaryX=binarizer.transform(x)
binaryX[0:5,:]

array([[1., 1., 0., 1., 1., 1., 1., 1.],
       [1., 1., 0., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1., 1.],
       [1., 1., 0., 1., 1., 1., 1., 1.]])

In [22]:
#Set C
#Import dataset and perform Discretization of Continuous Data
#Dataset name: Student_bucketing.csv
#Dataset link: https://github.com/TrainingByPackt/Data-Science-with-Python/blob/master/Chapter01/Data/Student_bucketing.csv

# 1 Write python code to import the required libraries and load the dataset into a pandas dataframe.
import pandas as pd
df = pd.read_csv('https://github.com/TrainingByPackt/Data-Science-with-Python/blob/master/Chapter01/Data/Student_bucketing.csv?raw=true')
df

Unnamed: 0,Student_id,Age,Grade,Employed,marks
0,1,19,1st Class,yes,29
1,2,20,2nd Class,no,41
2,3,18,1st Class,no,57
3,4,21,2nd Class,no,29
4,5,19,1st Class,no,57
...,...,...,...,...,...
227,228,21,1st Class,no,42
228,229,20,2nd Class,no,47
229,230,20,3rd Class,yes,21
230,231,19,1st Class,yes,64


In [23]:
#2) Display the first five rows of the dataframe.
df.head()

Unnamed: 0,Student_id,Age,Grade,Employed,marks
0,1,19,1st Class,yes,29
1,2,20,2nd Class,no,41
2,3,18,1st Class,no,57
3,4,21,2nd Class,no,29
4,5,19,1st Class,no,57


In [24]:
# 3) Discretized the marks column into five discrete buckets, the labels need to be populated accordingly with five values: Poor, Below_average, Average, Above_average, and Excellent. 
#Perform bucketing using the cut () function on the marks column and display the top 10 columns.
df['bucket']=pd.cut(df['marks'],5,labels=['Poor','Below_average','Average','Above_Average','Excellent'])
df.head(10)

Unnamed: 0,Student_id,Age,Grade,Employed,marks,bucket
0,1,19,1st Class,yes,29,Poor
1,2,20,2nd Class,no,41,Below_average
2,3,18,1st Class,no,57,Average
3,4,21,2nd Class,no,29,Poor
4,5,19,1st Class,no,57,Average
5,6,20,2nd Class,yes,53,Average
6,7,19,3rd Class,yes,78,Above_Average
7,8,21,3rd Class,yes,70,Above_Average
8,9,22,3rd Class,yes,97,Excellent
9,10,21,1st Class,no,58,Average


In [25]:
import pandas as pd
df = pd.read_csv('https://github.com/TrainingByPackt/Data-Science-with-Python/blob/master/Chapter01/Data/Student_bucketing.csv?raw=true')
df
df.head(10)

Unnamed: 0,Student_id,Age,Grade,Employed,marks
0,1,19,1st Class,yes,29
1,2,20,2nd Class,no,41
2,3,18,1st Class,no,57
3,4,21,2nd Class,no,29
4,5,19,1st Class,no,57
5,6,20,2nd Class,yes,53
6,7,19,3rd Class,yes,78
7,8,21,3rd Class,yes,70
8,9,22,3rd Class,yes,97
9,10,21,1st Class,no,58
