In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [2]:
data=pd.read_csv("Customers.csv")

In [3]:
data.head(3)

Unnamed: 0,CustomerID,Gender,Age,Salary,Spending_Score
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   CustomerID      200 non-null    int64 
 1   Gender          200 non-null    object
 2   Age             200 non-null    int64 
 3   Salary          200 non-null    int64 
 4   Spending_Score  200 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 7.9+ KB


In [6]:
data.describe()

Unnamed: 0,CustomerID,Age,Salary,Spending_Score
count,200.0,200.0,200.0,200.0
mean,100.5,38.85,60.56,50.2
std,57.879185,13.969007,26.264721,25.823522
min,1.0,18.0,15.0,1.0
25%,50.75,28.75,41.5,34.75
50%,100.5,36.0,61.5,50.0
75%,150.25,49.0,78.0,73.0
max,200.0,70.0,137.0,99.0


In [7]:
### Filtering: keep only customers strictly older than 30

# Build the boolean mask first so the condition has a readable name.
older_than_30 = data["Age"] > 30
filtered_data = data.loc[older_than_30]
print(filtered_data)

     CustomerID  Gender  Age  Salary  Spending_Score
4             5  Female   31      17              40
6             7  Female   35      18               6
8             9    Male   64      19               3
10           11    Male   67      19              14
11           12  Female   35      19              99
..          ...     ...  ...     ...             ...
194         195  Female   47     120              16
195         196  Female   35     120              79
196         197  Female   45     126              28
197         198    Male   32     126              74
198         199    Male   32     137              18

[138 rows x 5 columns]


In [8]:
### Recode the Gender labels to one-letter abbreviations

# `replace` returns a new Series; the Gender column inside `data` is not modified.
gender_abbreviations = {'Female': 'F', 'Male': 'M'}
n_data = data["Gender"].replace(gender_abbreviations)
print(n_data)

0      M
1      M
2      F
3      F
4      F
      ..
195    F
196    F
197    M
198    M
199    M
Name: Gender, Length: 200, dtype: object


In [9]:
### Multiple conditions: age between 25 and 35 (inclusive) AND salary above 100

# One named mask per condition; all three must hold for a row to be kept.
age_at_least_25 = data['Age'] >= 25
age_at_most_35 = data['Age'] <= 35
salary_above_100 = data["Salary"] > 100
filter_data = data[age_at_least_25 & age_at_most_35 & salary_above_100]

filter_data

Unnamed: 0,CustomerID,Gender,Age,Salary,Spending_Score
187,188,Male,28,101,68
190,191,Female,34,103,23
191,192,Female,32,103,69
192,193,Male,33,113,8
195,196,Female,35,120,79
197,198,Male,32,126,74
198,199,Male,32,137,18
199,200,Male,30,137,83


In [11]:
#### Counting Gender categories

# Each comparison yields True/False, and summing booleans counts the matches.
# Any value other than 'Male' / 'Female' is counted in neither total.
genders = data["Gender"]
male_count = sum(gender == 'Male' for gender in genders)
female_count = sum(gender == 'Female' for gender in genders)

print(male_count)
print(female_count)

88
112


In [12]:
### Categorize the age group

## Split the Age column into three lists, one comprehension per bucket:
##   young      -> age below 30
##   middle_age -> age from 30 up to (but not including) 60
##   senior     -> everything that falls in neither of the buckets above

ages = data["Age"]
young = [age for age in ages if age < 30]
middle_age = [age for age in ages if 30 <= age < 60]
senior = [age for age in ages if not (age < 30 or 30 <= age < 60)]

In [16]:
len(young)


55

In [18]:
len(middle_age)


125

In [19]:
len(senior)

20

In [20]:
### Iterate through each numeric column and report its minimum and maximum

# Select numeric columns by dtype family instead of testing `dtype in [int, float]`:
# that membership test only matches the platform-default integer and float64, so
# columns stored as e.g. int32, float32 or unsigned integers were silently skipped.
for column_name in data.select_dtypes(include="number").columns:
  min_val = data[column_name].min()
  max_val = data[column_name].max()
  # Label text is kept exactly as before so the recorded output still matches.
  print(f"column:{column_name}")
  print(f"Minmium Values: {min_val}")
  print(f"Maximum Values: {max_val}")


column:CustomerID
Minmium Values: 1
Maximum Values: 200
column:Age
Minmium Values: 18
Maximum Values: 70
column:Salary
Minmium Values: 15
Maximum Values: 137
column:Spending_Score
Minmium Values: 1
Maximum Values: 99


### **Functions**

In [21]:
## Function for data preprocessing

def preprocess_data(data):
  """Return a cleaned copy of ``data`` without modifying the caller's frame.

  Steps:
    1. Drop every row that contains at least one missing value.
    2. Cast the "Age" column to ``int``.

  Parameters
  ----------
  data : pandas.DataFrame
      Frame that contains an "Age" column.

  Returns
  -------
  pandas.DataFrame
      A new frame holding the remaining rows (original index labels kept).

  Raises
  ------
  KeyError
      If ``data`` has no "Age" column.
  """
  # dropna() without inplace=True returns a new frame, so the input is left intact
  # and re-running the calling cell always starts from the same data.
  cleaned = data.dropna()
  # assign() also returns a new frame; the Age column keeps its position.
  return cleaned.assign(Age=cleaned["Age"].astype(int))

In [22]:
data=preprocess_data(data)
data.head()

Unnamed: 0,CustomerID,Gender,Age,Salary,Spending_Score
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40


In [23]:
#### Custom data transformation

def transform_age(age):
  """Map a numeric age to its age-group label.

  Returns "Young" for ages below 30, "Middle_aged" for ages from 30 up to
  (but not including) 60, and "Senior" for anything else.
  """
  # Guard clauses: each check only runs when the previous one did not match.
  if age < 30:
    return "Young"
  if age < 60:
    return "Middle_aged"
  return "Senior"

In [24]:
## creating a new variable
data["Age_group"]=data['Age'].apply(transform_age)
data.head()

Unnamed: 0,CustomerID,Gender,Age,Salary,Spending_Score,Age_group
0,1,Male,19,15,39,Young
1,2,Male,21,15,81,Young
2,3,Female,20,16,6,Young
3,4,Female,23,16,77,Young
4,5,Female,31,17,40,Middle_aged


In [27]:
### Create a function to do specific data analysis

def calculate_summary(data):
  """Summarise the "Age" column of ``data``.

  Returns a dict with the keys "Mean Age", "Median Age", "Min Age" and
  "Max Age" (in that order), each holding the corresponding statistic.
  """
  ages = data["Age"]
  # Pair each label with the Series method that produces its value.
  reducers = (
      ("Mean Age", ages.mean),
      ("Median Age", ages.median),
      ("Min Age", ages.min),
      ("Max Age", ages.max),
  )
  return {label: compute() for label, compute in reducers}

In [29]:
stats=calculate_summary(data)
stats

{'Mean Age': 38.85, 'Median Age': 36.0, 'Min Age': 18, 'Max Age': 70}

In [30]:
for key,value in stats.items():
  print(f"{key}: {value}")


Mean Age: 38.85
Median Age: 36.0
Min Age: 18
Max Age: 70


### **OOP**

In [31]:
class BaseAnalyzer:
  """Load a CSV into ``self.df`` and expose basic inspection helpers."""

  def __init__(self,filepath):
    """Read ``filepath`` with ``pd.read_csv`` and keep the frame on ``self.df``."""
    loaded = pd.read_csv(filepath)
    self.df = loaded

  def dataset_dimentions(self):
    """Return the frame's shape as a ``(rows, columns)`` tuple."""
    n_rows, n_cols = self.df.shape
    return (n_rows, n_cols)

  def column_names(self):
    """Return the column labels as a plain list."""
    return self.df.columns.to_list()

  def display_head(self, rows=6):
    """Return the first ``rows`` rows (6 by default)."""
    return self.df.head(n=rows)

  def display_tail(self, rows=5):
    """Return the last ``rows`` rows (5 by default)."""
    return self.df.tail(n=rows)

In [32]:
class AdvanceDataAnalyzer(BaseAnalyzer):
  """Data-quality checks and clean-up helpers on top of ``BaseAnalyzer``.

  All methods operate on ``self.df``. The two clean-up methods
  (``remove_duplicate`` and ``filling_missing_values``) replace ``self.df``
  with the cleaned frame, so later checks reflect the clean-up.
  """

  def check_missing_values(self):
    """Return the number of missing values per column."""
    return self.df.isnull().sum()

  def duplicates_check(self):
    """Return how many rows are flagged as duplicates of an earlier row."""
    return self.df.duplicated().sum()

  def remove_duplicate(self):
    """Drop duplicated rows from ``self.df`` and return the resulting frame.

    The de-duplicated frame is stored back on ``self.df``; previously it was
    only returned, so a later ``duplicates_check()`` still reported the same
    duplicates. The return value is unchanged for existing callers.
    """
    self.df=self.df.drop_duplicates()
    return self.df

  def summary_stat(self):
    """Return ``describe()`` summary statistics for ``self.df``."""
    return self.df.describe()

  def filling_missing_values(self, value=0):
    """Replace missing values in ``self.df`` with ``value`` (0 by default).

    Updates ``self.df``; returns ``None``.
    """
    self.df=self.df.fillna(value)

In [33]:
data_analyzer=AdvanceDataAnalyzer('Movies.csv')

In [34]:
data_analyzer.dataset_dimentions()

(559, 6)

In [35]:
data_analyzer.check_missing_values()

Film                       0
Genre                      2
RottenTomatoes_Ratings     8
Audience Ratings          11
Budget_m                   9
Year_release               3
dtype: int64

In [36]:
data_analyzer.column_names()

['Film',
 'Genre',
 'RottenTomatoes_Ratings',
 'Audience Ratings',
 'Budget_m',
 'Year_release']

In [38]:
data_analyzer.display_head(2)

Unnamed: 0,Film,Genre,RottenTomatoes_Ratings,Audience Ratings,Budget_m,Year_release
0,(500) Days of Summer,Comedy,87.0,81.0,8.0,2009.0
1,"10,000 B.C.",Adventure,9.0,44.0,105.0,2008.0


In [39]:
data_analyzer.duplicates_check()

0

In [41]:
data_analyzer.summary_stat().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
RottenTomatoes_Ratings,551.0,47.477314,26.268386,0.0,25.5,46.0,70.0,97.0
Audience Ratings,548.0,58.99635,16.819506,0.0,47.0,59.0,72.0,96.0
Budget_m,550.0,50.390909,48.986726,0.0,20.0,35.0,65.0,300.0
Year_release,556.0,2009.160072,1.361473,2007.0,2008.0,2009.0,2010.0,2011.0
