# Pandas

Pandas is a Python package that provides fast, flexible, and expressive data structures designed to make working
with "relational" or "labeled" data both easy and intuitive. It aims to be the fundamental high-level building
block for doing practical, real-world data analysis in Python.

In [112]:
#to use pandas we have to import pandas library
#to install pandas -- pip install pandas
import pandas as pd
import numpy as np

In [113]:
#Build a Series from a plain Python list; no index is given, so pandas labels the values 0, 1, 2, ...
s1 = pd.Series(data=[1, 3, 9, 8, 4, 5, 9])
s1


0    1
1    3
2    9
3    8
4    4
5    5
6    9
dtype: int64

In [114]:
#slice positions 0 to 5 (the stop value 6 is excluded), i.e. the first 6 rows
s1[0:6]

0    1
1    3
2    9
3    8
4    4
5    5
dtype: int64

In [115]:
#a step of -1 returns the Series in reverse order; each value keeps its original index label
s1[::-1]

6    9
5    5
4    4
3    8
2    9
1    3
0    1
dtype: int64

6    9
5    5
4    4
3    8
2    9
1    3
0    1
dtype: int64

In [116]:
#assigning an alternate (string) index: each dict key becomes the label of its value
s2 = pd.Series({"r0": 0, "r1": 1, "r2": 2, "r3": 3, "r4": 4, "r5": 9})
s2

In [117]:
#accessing the values through the alternate index: label slices include BOTH end labels
#use labels that really exist ('r5' is the last one); a missing bound such as 'r7' is
#tolerated only while the index is sorted and raises KeyError otherwise
s2.loc['r0':'r5']

In [118]:
#Creating a DatetimeIndex of 6 consecutive days starting at 2013-01-01
dates = pd.date_range(start="20130101", periods=6)
dates

In [119]:
#6x4 frame of standard-normal random numbers: the dates are the row labels, A-D the column labels
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=["A", "B", "C", "D"])
df

In [120]:
#creating a Series from np.arange, which yields the integers 0..9
#arange lives in NumPy, so numpy must be imported
import numpy as np
s3 = pd.Series(np.arange(0, 10))
s3

In [121]:
#create a pandas Series holding 100 zeros (np.zeros produces floats)
import numpy as np
s4 = pd.Series(np.zeros(shape=100))
s4


In [122]:
#Creating a DataFrame from a mapping of column name -> values.
#Scalars (A, B, F) are repeated to match the length-4 columns (C, D, E).
df2 = pd.DataFrame(
    dict(
        A=1.0,
        B=pd.Timestamp("20130102"),
        C=pd.Series(1, index=list(range(4)), dtype="float32"),
        D=np.full(4, 3, dtype="int32"),
        E=pd.Categorical(["test", "train"] * 2),
        F="foo",
    )
)


df2

In [123]:
#to read a dataset
df = pd.read_csv('mtcars.csv')

In [124]:
#displays the whole dataset
df

In [125]:
#displays the particular column
df.disp

In [126]:
df['cyl']

In [127]:
#displays the first 5 rows (head() returns 5 rows when no count is given)
df.head()

In [128]:
#displays the last 10 rows
df.tail(10)

In [129]:
#displays no of rows and columns
df.shape

In [130]:
#to print the columns
df.columns

In [131]:
#to print specific column use df['model']or
df.model

In [132]:
#to display several chosen columns at once, pass a list of column names
df[['model','disp']]

In [133]:
#to display a index range
df.index

In [134]:
#to get the DataFrame's values as a NumPy array
df.to_numpy()

# STRING FUNCTIONS

In [135]:
#LOWER CASE
s = pd.Series(["A", "B", "C", "Aaba", "Baca", np.nan, "CABA", "dog", "cat"])
s.str.lower()

In [136]:
data = pd.DataFrame(np.random.randn(5, 4))
data

In [137]:
#split the DataFrame into three row slices (rows 0-2, rows 3-6, rows 7 onward)
#so they can be combined again with pd.concat
pieces = [df[:3], df[3:7], df[7:]]
pieces

In [138]:
pd.concat(pieces)

In [139]:
#counts the null values in each column (0 means the column has none)
data.isnull().sum()

In [140]:
#displays True for each column that contains at least one null value, and False otherwise
data.isnull().any()

In [141]:
#descriptive statistics summary of the dataframe.
#This includes count, mean, std deviation, percentiles, and min-max values of the numeric columns.
df.describe()

In [142]:
#transposes the dataset
df.T

In [143]:
#it sort the columns in ascending order
df.sort_index(axis=1, ascending=True)

In [144]:
#it sorts the columns in descending order
df.sort_index(axis=1, ascending=False)

# slicing

In [145]:
#to select particular rows
df[5:20]

# iloc and loc

In [146]:
#Purely integer-location based indexing for selection by position
df.iloc[8:16,3:8]

In [147]:
df

In [148]:
#print rows from datsun to lotus europa and columns from model to gear
#iloc stop values are excluded: 2:28 covers row positions 2-27, 0:11 covers column positions 0-10
df.iloc[2:28,0:11]

In [149]:
#using iloc select cars from valiant to  fiat 128 and columns model to hp
df.iloc[5:18,0:5]

In [150]:
#print all rows and 5 columns
df.iloc[:,:5]

In [151]:
df

In [152]:
#getting particular information
#using iloc camaro z28,Volvo 142E,Duster 360
#hp,mpg,am,gear
df.iloc[[23,31,6],[0,1,4,9,10]]

# loc

Access a group of rows and columns by label(s) or a boolean array.

In [153]:
df = pd.read_csv('mtcars.csv')

In [154]:
df

In [155]:
#loc function gives particular row information 
df.loc[0]

In [156]:
#displays the row whose index label is 31 (the 32nd row, since the default index starts at 0)
df.loc[31]

In [157]:
#displays rows labelled 2 to 5 and columns model to hp
#loc slices by label and includes both end labels
df.loc[2:5,'model':'hp']

In [158]:
df.head(2)

In [159]:
#displays rows labelled 0 to 6 (6 included) and only the listed columns
df.loc[:6,['model','cyl','am','hp']]

In [160]:
df

In [161]:
#question cars from datsun 710 - valiant 
#columns-model,wt,cyl,am,carb
df.loc[2:5,['model','wt','cyl','am','carb']]

# to_list

In [162]:
df.columns.to_list() #prints list of  columns

In [163]:
df.index.to_list()#prints index

# creating a dataframe from scratch

In [164]:
pd.DataFrame([[1,2,3,4,5],[7,8,9,10,11],[12,13,14,15,16]])

In [165]:
import numpy as np

In [166]:
#the integers 0..23 from np.arange, reshaped into 4 rows x 6 columns and wrapped in a DataFrame
s5 = pd.DataFrame(np.arange(24).reshape((4, 6)))
s5

In [167]:
#to add column names and row names
s5.columns = ['c1','c2','c3','c4','c5','c6']
s5.index = ['r1','r2','r3','r4']
s5

In [168]:
df.head()

In [169]:
df.info()

In [170]:
#tells about datatype
s5.info()

In [171]:
s5.describe()

In [172]:
df.head()

In [173]:
#prints rows greater than 100 in disp column
df[df.disp>100]

In [174]:
df[df.disp>100].shape

In [175]:
df[df.hp>100].shape

In [176]:
df[(df.mpg > 20) & (df.hp>110)].shape

In [177]:
#list the cars in which am = 1 and mpg >20
#list how many cars
df[(df.am==1) & (df.mpg >20)]

In [178]:
df[(df.am==1) & (df.mpg >20)].shape

# sorting 

In [179]:
#sorting the rows in ascending order by the values of the mpg column
df.sort_values('mpg')

In [180]:
#sort descending
df.sort_values('mpg',ascending = False)

In [181]:
#sorting by two columns: cyl in descending order, then hp in ascending order within each cyl value
df.sort_values(['cyl','hp'],ascending = [False,True])

# Unique

In [182]:
#unique --> lists the distinct values (categories) found in a column
#use it for categorical columns, not for continuous ones
df.am.unique()

In [183]:
df.gear.unique()

In [184]:
df.carb.unique()

In [185]:
df.cyl.unique()

In [186]:
#to count how many rows fall into each category
df.am.value_counts()

In [187]:
df.gear.value_counts()

In [188]:
#to find the maximum no of gears
df['gear'].max()

In [189]:
#to print the models which have the maximum number of gears
#one .loc call filters the rows (boolean mask) and picks the columns in a single step,
#instead of chaining two [] lookups
df.loc[df['gear'] == df['gear'].max(), ['model', 'gear']]

In [190]:
df

# Merge the two datasets

In [191]:
#Match1 scores of five players, used as the left table in the merge examples
#(merge performs an inner join by default)
d1 = pd.DataFrame(
    {
        'name': ['Virat', 'Rahul', 'Shreyas', 'Rohit', 'Dhoni'],
        'Match1': [169, 51, 60, 108, 53],
    }
)
d1

In [192]:
#Match2 scores of four players (Dhoni has no entry here)
d2 = pd.DataFrame(
    {
        'name': ['Virat', 'Rahul', 'Shreyas', 'Rohit'],
        'Match2': [121, 102, 45, 90],
    }
)
d2

In [193]:
#join the two score tables on the shared name column
#merge defaults to an inner join, so only names present in both tables are kept
dm = d1.merge(d2, on='name')
dm

In [194]:

#Match3 scores of five players
d3 = pd.DataFrame(
    {
        'name': ['Virat', 'Rahul', 'Shreyas', 'Rohit', 'Rishabh'],
        'Match3': [89, 145, 162, 45, 120],
    }
)
d3

In [195]:
#merging three datasets: merge two of them first (dm), then merge that result with the third
#the default inner join keeps only the rows whose name appears in every table
dmm = dm.merge(d3, on='name')
dmm

# merge with outer join

In [196]:
#an outer join keeps every name from both tables; scores missing on one side become NaN
dm1 = d1.merge(d2, how='outer', on='name')
dm1

In [197]:
#thus we get all the players' names by the outer join
#the indicator=True parameter adds an extra _merge column that tells which table each row came from
#NOTE(review): this rebinds df, which earlier in the notebook held the mtcars data
df = pd.merge(dm1,d3, on = 'name', how = 'outer' , indicator = True)
df

# merge with left join

In [198]:
#a left join keeps every row of the left table (d1) and adds the matching d2 columns
dfl = d1.merge(d2, how='left', on='name')
dfl

# merge with right join

In [199]:
#the indicator=True parameter creates a _merge column that tells which table each row came from
dfr = pd.merge(d2, d3, on = 'name', how = 'right', indicator = True)
dfr

In [200]:
#redefine d1 with a score column simply called Match, to show how merge handles clashing column names
d1 = pd.DataFrame(
    {
        'name': ['Virat', 'Rahul', 'Shreyas', 'Rohit', 'Dhoni'],
        'Match': [169, 51, 60, 108, 53],
    }
)
d1

In [201]:
#redefine d2 with the same Match column name as d1
d2 = pd.DataFrame(
    {
        'name': ['Virat', 'Rahul', 'Shreyas', 'Rohit'],
        'Match': [121, 102, 45, 90],
    }
)
d2

In [202]:
#both tables have a Match column, so merge tells them apart with the default _x and _y suffixes
dfn = pd.merge(d1,d2, on = "name")
dfn

In [203]:
#we can also replace the default _x and _y suffixes with different ones
dfn = pd.merge(d1,d2, on = "name", suffixes = ('_first', '_second'))
dfn

# Treating NAN values

In [204]:
import pandas as pd
DF =  pd.read_csv('movie_scores.csv')

In [205]:
DF

In [206]:
#finding the no of rows and columns
DF.shape

In [207]:
DF.info()

In [208]:
#identifing nan value columns
DF.isnull().any()

In [209]:
#identifying no of null values
DF.isnull().sum()

In [210]:
#using fillna function fills with specified values i.e the null values are filled with 100
DF1 = DF.fillna(100)
DF1

# Forward Fill

In [211]:
#forward fill --> each null is replaced by the last valid value above it in the same column
#DataFrame.ffill() is used because fillna(method='ffill') is deprecated in recent pandas
DF2 = DF.ffill()
DF2

In [212]:
#the limit parameter caps the fill: limit=1 fills at most one consecutive null after each valid value
#DataFrame.ffill() is used because fillna(method='ffill') is deprecated in recent pandas
DF2 = DF.ffill(limit=1)
DF2

In [213]:
#interpolate replaces NaN values with values estimated from the neighbouring rows
#(linear interpolation by default)
new_df = DF.interpolate()
new_df

# Backward Fill

In [214]:
#backward fill --> each null is replaced by the next valid value below it in the same column
#DataFrame.bfill() is used because fillna(method='bfill') is deprecated in recent pandas
DF3 = DF.bfill()
DF3

# Fill with mean or median

In [215]:
#for categorical columns, replace null fields with the mode
#for numerical columns, replace null fields with the mean or median

In [216]:
DF

In [217]:
#fill the nulls of every numeric column with that column's median
#numeric_only=True skips non-numeric columns, on which median() raises TypeError in pandas 2.0+
DF4 = DF.fillna(DF.median(numeric_only=True))
DF4

In [218]:
#fill whatever is still null (e.g. in categorical columns) with each column's mode
#DF.mode() returns a DataFrame (one row per tied mode); passing it straight to fillna aligns
#on the row index and would only fill row 0, so .iloc[0] takes the first mode of every
#column as a Series, which fillna then applies to all rows
DF5 = DF4.fillna(DF.mode().iloc[0])
DF5

In [219]:
#using the replace function - replacing incorrect values (here 2.0 and 8.0) with NaN
#np.nan is the supported spelling; the np.NaN alias was removed in NumPy 2.0
new_df = DF.replace([2.0, 8.0], np.nan)
new_df

In [220]:
#replace values based on specific columns:
#51.0 in the age column and 6.0 in the pre_movie_score column are replaced with NaN
#np.nan is the supported spelling; the np.NaN alias was removed in NumPy 2.0
new_df = DF.replace({
    'age': 51.0,
    'pre_movie_score' : 6.0
}, np.nan)
new_df

In [221]:
#to replace specific values wherever they occur:
#each dict key is the value to look for and the dict value is its replacement
new_df = DF.replace({
    "Tom" : 'Steve',
    63 : 36
})
new_df

# using Dropna

In [222]:
#dropna removes every row that contains at least one null value
#use it only when few rows are affected and the dataset is large enough to spare them
DF6 = DF.dropna()
DF6
