In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Analytics and Statistics using Python
## L11: Pandas


<img src='../../prasami_images/prasami_color_tutorials_small.png' width='400' alt="By Pramod Sharma : pramod.sharma@prasami.com" align = "left"/>

In [2]:
import pandas as pd


In [3]:
# Show the installed pandas version so readers know which release produced the outputs.
pd.__version__

'2.2.2'

In [4]:
# Build a Series from a plain Python list; no index is given, so pandas
# labels the values with the default integer positions.
num_list = [10, 20, 30, 40, 50]
s = pd.Series(data=num_list)
s

0    10
1    20
2    30
3    40
4    50
dtype: int64

In [5]:
# Same values as the previous Series, now labelled with explicit string keys
# instead of the default integer index.
num_idx = ['a', 'b', 'c', 'd', 'e']
s = pd.Series(data=num_list, index=num_idx)
s

a    10
b    20
c    30
d    40
e    50
dtype: int64

In [6]:
# A dict maps straight onto a Series: keys become the index, values the data.
dct = dict(name='PKS', city='Pune', country='India')
pd.Series(data=dct)

name         PKS
city        Pune
country    India
dtype: object

In [7]:
# Each inner list is one row of the frame.
# Note that the middle value of the last row is the string '80', not a number.
data = [
    [1, 2, 3],
    [4, 5, 6],
    [7, '80', 9],
]
data_df = pd.DataFrame(data=data, columns=['col1', 'col2', 'col3'])
data_df

Unnamed: 0,col1,col2,col3
0,1,2,3
1,4,5,6
2,7,80,9


In [8]:
# Dict of equal-length lists: every key becomes a column, every list its values.
dct = dict(
    name=['PKS', 'AAA', 'BBB'],
    city=['Pune', 'Mumbai', 'Indore'],
    country=['India', 'I', 'Ind'],
)
pd.DataFrame(data=dct)

Unnamed: 0,name,city,country
0,PKS,Pune,India
1,AAA,Mumbai,I
2,BBB,Indore,Ind


In [9]:
# Column-oriented source data for the people table used in the cells that follow.
dct = dict(
    name=['PKS', 'Soham', 'Arti'],
    city=['Pune', 'London', 'Stockholm'],
    country=['India', 'UK', 'Sweden'],
)
data_df = pd.DataFrame(data=dct)
data_df

Unnamed: 0,name,city,country
0,PKS,Pune,India
1,Soham,London,UK
2,Arti,Stockholm,Sweden


In [10]:
# Append two numeric columns, one value per existing row, in a single step.
data_df = data_df.assign(
    weight=[74, 78, 61],
    height=[173, 175, 169],
)
data_df

Unnamed: 0,name,city,country,weight,height
0,PKS,Pune,India,74,173
1,Soham,London,UK,78,175
2,Arti,Stockholm,Sweden,61,169


In [11]:
# Scale the height column by 0.01 (presumably centimetres -> metres; confirm units).
# NOTE: the column is overwritten, so running this cell a second time scales it again.
data_df['height'] = data_df['height'].mul(0.01)


In [12]:
# BMI = Weight / (Height ** 2)
height_squared = data_df['height'].pow(2)
data_df['bmi'] = data_df['weight'].div(height_squared)
data_df

Unnamed: 0,name,city,country,weight,height,bmi
0,PKS,Pune,India,74,1.73,24.725183
1,Soham,London,UK,78,1.75,25.469388
2,Arti,Stockholm,Sweden,61,1.69,21.357796


In [13]:
# Keep two decimal places of the BMI; the rounded values replace the column.
data_df['bmi'] = data_df['bmi'].round(decimals=2)
data_df

Unnamed: 0,name,city,country,weight,height,bmi
0,PKS,Pune,India,74,1.73,24.73
1,Soham,London,UK,78,1.75,25.47
2,Arti,Stockholm,Sweden,61,1.69,21.36


In [14]:
# Add a 'Year' column. The values are given as strings (quoted), not integers,
# so the column cannot be used in arithmetic until it is cast to a numeric type.
data_df['Year'] = ['1769', '1985', '1990']
data_df

Unnamed: 0,name,city,country,weight,height,bmi,Year
0,PKS,Pune,India,74,1.73,24.73,1769
1,Soham,London,UK,78,1.75,25.47,1985
2,Arti,Stockholm,Sweden,61,1.69,21.36,1990


In [15]:
# Inspect the row labels of the frame.
data_df.index

RangeIndex(start=0, stop=3, step=1)

In [16]:
# Broadcast the reference year (2024) to one entry per row of data_df.
# The index is taken from the frame itself rather than typed out by hand, so
# the Series keeps lining up with the frame if rows are added or relabelled.
# A mismatched index would produce NaN on column assignment.
curr_yr = pd.Series(2024, index=data_df.index)
curr_yr

0    2024
1    2024
2    2024
dtype: int64

In [17]:
# Attach the Series as a new column; pandas matches the values to rows by index label.
data_df['curr_yr'] = curr_yr
data_df

Unnamed: 0,name,city,country,weight,height,bmi,Year,curr_yr
0,PKS,Pune,India,74,1.73,24.73,1769,2024
1,Soham,London,UK,78,1.75,25.47,1985,2024
2,Arti,Stockholm,Sweden,61,1.69,21.36,1990,2024


In [18]:
# Left disabled on purpose: at this point 'Year' still holds strings (see the
# dtype check in the next cell), so the subtraction only becomes valid after
# the column has been cast to an integer type.
# data_df['age'] = data_df['curr_yr'] - data_df['Year']


In [19]:
# Check how 'Year' is stored before the cast.
data_df['Year'].dtype

dtype('O')

In [20]:
import numpy as np
# Cast the year strings to 16-bit integers so the column can be used in arithmetic.
data_df['Year'] =data_df['Year'].astype(np.int16)


In [21]:
# Confirm the cast took effect.
data_df['Year'].dtype

dtype('int16')

In [22]:
# Age is the reference year minus the recorded year, computed row by row.
data_df['age'] = data_df['curr_yr'].sub(data_df['Year'])

In [23]:
# Display the frame including the newly derived 'age' column.
data_df

Unnamed: 0,name,city,country,weight,height,bmi,Year,curr_yr,age
0,PKS,Pune,India,74,1.73,24.73,1769,2024,255
1,Soham,London,UK,78,1.75,25.47,1985,2024,39
2,Arti,Stockholm,Sweden,61,1.69,21.36,1990,2024,34


In [26]:
# Cast to float so a fractional replacement value can be stored in the column.
data_df['age'] = data_df['age'].astype(np.float32)

# Average of the plausible ages only, taken from the data instead of being
# copied by hand from the displayed table. The 120-year cut-off matches the
# filter used in the following cell, which replaces the implausible entries.
mean = data_df.loc[data_df['age'] <= 120, 'age'].mean()

In [27]:
# Flag ages above 120 as implausible and overwrite them with the precomputed mean.
implausible_age = data_df['age'] > 120
data_df.loc[implausible_age, 'age'] = mean
data_df

Unnamed: 0,name,city,country,weight,height,bmi,Year,curr_yr,age
0,PKS,Pune,India,74,1.73,24.73,1769,2024,36.5
1,Soham,London,UK,78,1.75,25.47,1985,2024,39.0
2,Arti,Stockholm,Sweden,61,1.69,21.36,1990,2024,34.0


## Read Data from CSV file

In [29]:
# Location of the movie-ratings CSV, relative to this notebook.
data_path = '../../input/machine_learning/movie_rating.csv'

# Load the file into a DataFrame (this rebinds data_df) and preview the first five rows.
data_df = pd.read_csv(filepath_or_buffer=data_path)
data_df.head(n=5)

Unnamed: 0,Name,Year,Rating,Description,Genre_1,Genre_2,Genre_3,Certificate,Runtime
0,Sunset Blvd.,1950,8.7,A hack screenwriter writes a screenplay for a ...,Drama,Film-Noir,,APPROVED,110
1,12 Angry Men (1957),1957,8.9,A dissenting juror in a murder trial slowly ma...,Crime,Drama,,APPROVED,96
2,Dr. Strangelove or: How I Learned to Stop Worr...,1964,8.5,An insane general triggers a path to nuclear h...,Comedy,,,APPROVED,95
3,The Godfather,1972,9.2,"""The aging patriarch of an organized crime dy...",Crime,Drama,,R,175
4,The Godfather: Part II (1974),1974,9.1,The early life and career of Vito Corleone in ...,Crime,Drama,,R,202


In [30]:
# List the column labels of the loaded frame.
data_df.columns

Index(['Name', 'Year', 'Rating', 'Description', 'Genre_1', 'Genre_2',
       'Genre_3', 'Certificate', 'Runtime'],
      dtype='object')

In [31]:
# Print a summary: row count, per-column non-null counts, dtypes and memory usage.
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Name         10 non-null     object 
 1   Year         10 non-null     int64  
 2   Rating       10 non-null     float64
 3   Description  10 non-null     object 
 4   Genre_1      10 non-null     object 
 5   Genre_2      9 non-null      object 
 6   Genre_3      2 non-null      object 
 7   Certificate  10 non-null     object 
 8   Runtime      10 non-null     int64  
dtypes: float64(1), int64(2), object(6)
memory usage: 848.0+ bytes


In [33]:
# Show the dtype of every column.
data_df.dtypes

Name            object
Year             int64
Rating         float64
Description     object
Genre_1         object
Genre_2         object
Genre_3         object
Certificate     object
Runtime          int64
dtype: object

In [34]:
# Cross-section: fetch the row whose index label is 2, returned as a Series.
data_df.xs(2)

Name           Dr. Strangelove or: How I Learned to Stop Worr...
Year                                                        1964
Rating                                                       8.5
Description    An insane general triggers a path to nuclear h...
Genre_1                                                   Comedy
Genre_2                                                      NaN
Genre_3                                                      NaN
Certificate                                             APPROVED
Runtime                                                       95
Name: 2, dtype: object

In [35]:
# Fetch the row at integer position 2 (the third row), returned as a Series.
data_df.iloc[2]

Name           Dr. Strangelove or: How I Learned to Stop Worr...
Year                                                        1964
Rating                                                       8.5
Description    An insane general triggers a path to nuclear h...
Genre_1                                                   Comedy
Genre_2                                                      NaN
Genre_3                                                      NaN
Certificate                                             APPROVED
Runtime                                                       95
Name: 2, dtype: object

In [36]:
# Label-based lookup of a single cell: row label 2, column 'Description'.
# This returns the full, untruncated text.
data_df.loc[2, 'Description']

'An insane general triggers a path to nuclear holocaust that a war room full of politicians and generals frantically try to stop.'

In [37]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data_df.describe()

Unnamed: 0,Year,Rating,Runtime
count,10.0,10.0,10.0
mean,1978.8,8.87,143.7
std,19.35516,0.286938,39.763328
min,1950.0,8.5,95.0
25%,1966.0,8.625,110.75
50%,1974.5,8.9,147.0
75%,1994.0,9.075,169.75
max,2008.0,9.3,202.0


In [38]:
# Keep only the rows whose Rating is strictly greater than 9.
data_df.loc[data_df['Rating'] > 9]

Unnamed: 0,Name,Year,Rating,Description,Genre_1,Genre_2,Genre_3,Certificate,Runtime
3,The Godfather,1972,9.2,"""The aging patriarch of an organized crime dy...",Crime,Drama,,R,175
4,The Godfather: Part II (1974),1974,9.1,The early life and career of Vito Corleone in ...,Crime,Drama,,R,202
6,The Shawshank Redemption,1994,9.3,Two imprisoned men bond over a number of years...,Crime,Drama,,R,142
