In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('Age-Income-Dataset.csv')
df.head()

Unnamed: 0,Age,Income
0,Young,25000
1,Middle Age,54000
2,Old,60000
3,Young,15000
4,Young,45000


In [3]:
df.isnull().sum()

Age       0
Income    0
dtype: int64

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Age     50 non-null     object
 1   Income  50 non-null     int64 
dtypes: int64(1), object(1)
memory usage: 932.0+ bytes


## Mean

In [5]:
def calMean(df):
    """Print the arithmetic mean of every numeric column, computed two ways.

    For each numeric column the mean is first accumulated by hand (sum of the
    non-missing values divided by their count) and then printed again using
    ``Series.mean`` so the two results can be compared side by side.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise. Non-numeric columns are skipped.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    for col in df.columns:
        # Select by numeric dtype rather than "not object": datetime,
        # categorical and string-dtype columns are not object dtype either,
        # and adding their values to an integer accumulator raises TypeError.
        if not pd.api.types.is_numeric_dtype(df[col]):
            continue

        # Series.mean skips missing values, so the manual version does too;
        # otherwise a single NaN would make the two results disagree.
        values = df[col].dropna()

        total = 0
        for value in values:
            total = total + value

        count = len(values)
        # A column with no values has no mean: report NaN (as pandas does)
        # instead of raising ZeroDivisionError.
        average = total / count if count else float("nan")

        print("Without Library -> ")
        print("Average of {} is {}".format(col, average))
        print()
        print("With Library -> ")
        print(df[col].mean())

In [6]:
calMean(df)

Without Library -> 
Averge of Income is 50966.0

With Library -> 
50966.0


## Median

In [7]:
def calMedian(df):
    """Print the median of every numeric column, computed two ways.

    The manual version sorts the non-missing values and takes the middle
    element (or the average of the two middle elements when the count is
    even). The result is printed next to ``Series.median`` for comparison.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise. Non-numeric columns are skipped.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    for col in df.columns:
        # Only numeric columns: averaging the two middle values of, e.g., a
        # datetime column raises TypeError.
        if not pd.api.types.is_numeric_dtype(df[col]):
            continue

        # Drop missing values before sorting. NaN compares False with
        # everything, so sorting a list containing it gives an arbitrary
        # order; Series.median ignores NaN as well.
        ordered = sorted(df[col].dropna())
        count = len(ordered)
        middle_i = count // 2

        if count == 0:
            # No values at all: mirror pandas and report NaN.
            middle_ele = float("nan")
        elif count % 2 == 0:
            middle_ele = (ordered[middle_i] + ordered[middle_i - 1]) / 2
        else:
            middle_ele = ordered[middle_i]

        print("Without Library -> ")
        print("Median of {} is {}".format(col, middle_ele))
        print()
        print("With Library -> ")
        print(df[col].median())

In [8]:
calMedian(df)

Without Library -> 
Median of Income is 46850.0

With Library -> 
46850.0


## Mode

In [9]:
def calMode(df):
    """Print the most frequent value of every text column, computed two ways.

    The manual version counts how often each non-missing value occurs and
    reports the value with the highest count. When several values share the
    highest count, the smallest of them is reported, which is the first entry
    of the sorted result that ``Series.mode`` returns. The pandas result is
    printed underneath for comparison.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise. Only object / string columns are processed.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    for col in df.columns:
        column = df[col]
        # Accept both the classic object dtype and pandas' dedicated string
        # dtypes, so text columns are recognised however they are stored.
        is_text = (pd.api.types.is_object_dtype(column)
                   or pd.api.types.is_string_dtype(column))
        if not is_text:
            continue

        # Tally every value. Tracking a full count per value (instead of
        # comparing each run with only the run before it) guarantees the
        # overall maximum is found, and no positional/label lookup is needed.
        counts = {}
        for value in column.dropna():
            counts[value] = counts.get(value, 0) + 1

        if counts:
            highest = max(counts.values())
            ans = min(value for value, seen in counts.items() if seen == highest)
        else:
            # Column has no non-missing values, so there is no mode.
            ans = None

        print("Without Library -> ")
        print("Mode of {} is {}".format(col, ans))
        print()
        print("With Library -> ")
        print(df[col].mode())

In [10]:
calMode(df)

Without Library -> 
Mode of Age is Old

With Library -> 
0    Old
Name: Age, dtype: object


## Standard Deviation

In [11]:
import math
def calSD(df):
    """Print the population standard deviation of every numeric column.

    The manual version computes the mean of the non-missing values, sums the
    squared deviations from it, divides by the number of values (population
    form, i.e. divisor ``n``) and takes the square root. The result is printed
    next to ``np.std`` for comparison.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise. Non-numeric columns are skipped.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    for col in df.columns:
        # Only numeric columns: squaring the deviations of, e.g., a datetime
        # column raises TypeError.
        if not pd.api.types.is_numeric_dtype(df[col]):
            continue

        # Ignore missing values so one NaN does not turn the manual result
        # into NaN.
        values = df[col].dropna()
        n = len(values)

        if n == 0:
            # Nothing to measure: report NaN rather than dividing by zero.
            std = float("nan")
        else:
            mean_val = sum(values) / n

            squared_dev = 0
            for value in values:
                squared_dev += (value - mean_val) ** 2

            std = math.sqrt(squared_dev / n)

        print("Without Library -> ")
        print("Standard Deviation of {} is {}".format(col, std))
        print()
        print("With Library -> ")
        print(np.std(df[col]))

In [12]:
calSD(df)

Without Library -> 
Standard Deviation of Income is 20884.6509187968

With Library -> 
20884.6509187968


## Minimum Value

In [13]:
def calMinimum(df):
    """Print the smallest value of every non-object column, computed two ways.

    The manual version sorts the column and takes the first element; the
    result of the built-in ``min`` is printed underneath for comparison.
    Nothing is returned.
    """
    # Lazily pick the columns whose dtype is not object.
    candidates = (name for name in df.columns if df[name].dtype != 'O')

    for name in candidates:
        column = df[name]
        smallest = sorted(column)[0]

        print("Without Library -> ")
        print("Minimum of {} is {}".format(name, smallest))
        print()
        print("With Library -> ")
        print(min(column))

In [14]:
calMinimum(df)

Without Library -> 
Minimum of Income is 15000

With Library -> 
15000


## Maximum Value

In [15]:
def calMax(df):
    """Print the largest value of every non-object column, computed two ways.

    The manual version sorts the column and takes the last element; the
    result of the built-in ``max`` is printed underneath for comparison.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise. Object-dtype columns are skipped.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    for col in df.columns:
        if df[col].dtype != 'O':
            ordered = sorted(df[col])
            # After an ascending sort the maximum is the last element.
            largest = ordered[-1]

            print("Without Library -> ")
            # Label reads "Maximum": this function reports the largest value.
            print("Maximum of {} is {}".format(col, largest))
            print()
            print("With Library -> ")
            print(max(df[col]))

In [16]:
calMax(df)

Without Library -> 
Minimum of Income is 93000

With Library -> 
93000


## Summary Statistics

In [17]:
df.groupby('Age').count()

Unnamed: 0_level_0,Income
Age,Unnamed: 1_level_1
Middle Age,15
Old,19
Young,16


In [21]:
# Work on a copy so the original frame is left as loaded.
data = df.copy()

# Group the incomes by age category once and reuse the grouped object for
# every statistic. Note: the grouped .std() is pandas' sample standard
# deviation (ddof=1 by default).
income_by_age = data.groupby('Age')['Income']

mean_all = income_by_age.mean()
median_all = income_by_age.median()
std_all = income_by_age.std()
min_all = income_by_age.min()
max_all = income_by_age.max()

# Display the per-category means as the cell output.
mean_all

Age
Middle Age    52453.333333
Old           53942.105263
Young         46037.500000
Name: Income, dtype: float64

In [19]:
# Build, for every age category, a list of formatted statistic strings.
# The number of categories is taken from the grouped result instead of being
# hard-coded, so the cell keeps working if the data gains or loses a category.
stat_series = [
    ('Mean', mean_all),
    ('Standard Deviation', std_all),
    ('Minimum', min_all),
    ('Maximum', max_all),
    ('Median', median_all),
]

# info[i] holds the strings for the i-th category, in the order of the
# grouped index (mean_all.index).
info = [
    ['{} : {}'.format(label, series.iloc[i]) for label, series in stat_series]
    for i in range(len(mean_all))
]

info

[['Mean : 52453.333333333336',
  'Standard Deviation : 20497.800114251517',
  'Minimum : 25600',
  'Maximum : 93000',
  'Median : 53200.0'],
 ['Mean : 53942.10526315789',
  'Standard Deviation : 20868.165968220423',
  'Minimum : 24500',
  'Maximum : 89700',
  'Median : 45300.0'],
 ['Mean : 46037.5',
  'Standard Deviation : 22356.859499193233',
  'Minimum : 15000',
  'Maximum : 87000',
  'Median : 41500.0']]

In [20]:
# Take the category labels from the grouped result so they always match the
# order in which `info` was built, rather than relying on a hand-typed list.
diff_categories = list(mean_all.index)

for category, stats in zip(diff_categories, info):
    print('*****Statistics of {}*****'.format(category))
    # Distinct loop variable: the inner loop no longer reuses the outer name.
    for line in stats:
        print(line)

    print('-------------------------------------------------------')

*****Statistics of Middle Age*****
Mean : 52453.333333333336
Standard Deviation : 20497.800114251517
Minimum : 25600
Maximum : 93000
Median : 53200.0
-------------------------------------------------------
*****Statistics of Old*****
Mean : 53942.10526315789
Standard Deviation : 20868.165968220423
Minimum : 24500
Maximum : 89700
Median : 45300.0
-------------------------------------------------------
*****Statistics of Young*****
Mean : 46037.5
Standard Deviation : 22356.859499193233
Minimum : 15000
Maximum : 87000
Median : 41500.0
-------------------------------------------------------
