# __Data Handling and Manipulation.__
Date : 15, May, 2024.

Importing libraries.

In [18]:
import pandas as pd
import numpy as np

Loading the dataset

In [19]:
filename = "sales.csv"
data = pd.read_csv(filename)
data.head()

Unnamed: 0,rate,sales_in_first_month,sales_in_second_month,sales_in_third_month
0,7.0,,150,200
1,8.0,120.0,180,240
2,,80.0,120,160
3,4.0,150.0,225,300
4,5.0,,135,180


Checking Missing Values.

In [20]:
# show number of missing values
data.isnull().sum()     

rate                     2
sales_in_first_month     3
sales_in_second_month    0
sales_in_third_month     0
dtype: int64

Checking for the data-type of the columns.

In [21]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   rate                   8 non-null      float64
 1   sales_in_first_month   7 non-null      float64
 2   sales_in_second_month  10 non-null     int64  
 3   sales_in_third_month   10 non-null     int64  
dtypes: float64(2), int64(2)
memory usage: 452.0 bytes


Mean of Columns.

In [22]:
data.mean()

rate                       5.750000
sales_in_first_month     117.857143
sales_in_second_month    164.900000
sales_in_third_month     220.000000
dtype: float64

Handling missing or Nan values.

In [23]:
data['rate'].fillna(0, inplace = True)
data.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['rate'].fillna(0, inplace = True)


Unnamed: 0,rate,sales_in_first_month,sales_in_second_month,sales_in_third_month
0,7.0,,150,200
1,8.0,120.0,180,240
2,0.0,80.0,120,160
3,4.0,150.0,225,300
4,5.0,,135,180


In [24]:
data['sales_in_first_month'].fillna(118, inplace=True)
data.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['sales_in_first_month'].fillna(118, inplace=True)


Unnamed: 0,rate,sales_in_first_month,sales_in_second_month,sales_in_third_month
0,7.0,118.0,150,200
1,8.0,120.0,180,240
2,0.0,80.0,120,160
3,4.0,150.0,225,300
4,5.0,118.0,135,180


Converting data-type

In [25]:
data['sales_in_first_month'] = data['sales_in_first_month'].astype('int64')
data['rate'] = data['rate'].astype('int64')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column                 Non-Null Count  Dtype
---  ------                 --------------  -----
 0   rate                   10 non-null     int64
 1   sales_in_first_month   10 non-null     int64
 2   sales_in_second_month  10 non-null     int64
 3   sales_in_third_month   10 non-null     int64
dtypes: int64(4)
memory usage: 452.0 bytes


Feature Selection

In [26]:
X = data.iloc[:, :3]  # take ssles upto 2nd month (3 column)
Y = data.iloc[:, -1]  # select last column  

In [27]:
X.head()

Unnamed: 0,rate,sales_in_first_month,sales_in_second_month
0,7,118,150
1,8,120,180
2,0,80,120
3,4,150,225
4,5,118,135


In [28]:
Y.head()

0    200
1    240
2    160
3    300
4    180
Name: sales_in_third_month, dtype: int64

Concatenating X and Y

In [29]:
df = pd.concat([X, Y], axis=1)
df.head()

Unnamed: 0,rate,sales_in_first_month,sales_in_second_month,sales_in_third_month
0,7,118,150,200
1,8,120,180,240
2,0,80,120,160
3,4,150,225,300
4,5,118,135,180


In [30]:
df.dtypes

rate                     int64
sales_in_first_month     int64
sales_in_second_month    int64
sales_in_third_month     int64
dtype: object

Saving Data after cleaning.

In [31]:
df.to_csv('clean_sales.csv')

Fitting the linear Model.

In [32]:
from sklearn.linear_model import LinearRegression

In [34]:
clf = LinearRegression()
clf.fit(X, Y)
print(clf.score(X, Y))

0.9999738258921149


Saving the Model.

In [36]:
import pickle

In [None]:
pickle.dump(clf, open('model.pkl', 'wb'))