# Pandas 
# - Common mistakes 
# - Reading Documentation + Google Search +  Fix bugs
# - Code walkthrough of Common operations
# Prerequisites: live sessions in Python programming + course videos

In [1]:
# Load data.csv into a DataFrame and print the whole table.
# NOTE: the path below is an absolute Windows path, so the file must exist at exactly
# that location; it is not resolved relative to the notebook's folder.
import pandas as pd
df = pd.read_csv(r'D:\LEARNING\data.csv')
print(df.to_string())

     date  temperature_F  temperature_C  humidity_pct  Events  is_rain
0   Jan 1             27          -2.78            45   sunny    False
1   Jan 2             31          -0.56            50  cloudy    False
2   Jan 3             23          -5.00            55   rainy     True
3   Jan 4             34           1.11            60   sunny    False
4   Jan 5             37           2.78            65   sunny    False
5   Jan 6             38           3.33            70   rainy     True
6   Jan 7             29          -1.67            75  cloudy    False
7   Jan 8             30          -1.11            80     fog    False
8   Jan 9             35           1.67            85   rainy     True
9  Jan 10             30          -1.11            90  cloudy    False


In [2]:
# Q: Why use a raw string for file names?
# In a normal string literal a backslash starts an escape sequence (backslash + n is a newline).
# The r prefix makes a raw string in which backslashes are kept as-is, which is handy for Windows paths.
import pandas as pd 
df = pd.read_csv(r'D:\LEARNING\data.csv') # r = raw string

raw_string = r'Hi\nHello'
print(raw_string) # raw string: the backslash and the n are printed literally, on one line

s = 'Hi\nHello'
print(s) # normal string: the escape becomes a line break, so two lines are printed

Hi\nHello
Hi
Hello


In [3]:
# Dealing with file paths across operating systems.

# pathlib (Python 3 standard library) represents paths as objects;
# the / operator joins path components, so no manual separator handling is needed.
import pandas as pd
from pathlib import Path

data_folder = Path("D:/LEARNING") # folder that contains the data file

file_to_open = data_folder / "data.csv" # path of data.csv inside data_folder
df = pd.read_csv(file_to_open)

print(df) # printing the DataFrame

     date  temperature_F  temperature_C  humidity_pct  Events  is_rain
0   Jan 1             27          -2.78            45   sunny    False
1   Jan 2             31          -0.56            50  cloudy    False
2   Jan 3             23          -5.00            55   rainy     True
3   Jan 4             34           1.11            60   sunny    False
4   Jan 5             37           2.78            65   sunny    False
5   Jan 6             38           3.33            70   rainy     True
6   Jan 7             29          -1.67            75  cloudy    False
7   Jan 8             30          -1.11            80     fog    False
8   Jan 9             35           1.67            85   rainy     True
9  Jan 10             30          -1.11            90  cloudy    False


In [4]:
# Q: what's the difference between df['col'] and df[['col']]?
# Single brackets with one column label return a Series;
# double brackets (a list of labels) return a DataFrame, even for a single column.

df['date'] # Series (not displayed: only the last expression of a cell is shown)
df[['date']] # DataFrame (this is the output shown below)

Unnamed: 0,date
0,Jan 1
1,Jan 2
2,Jan 3
3,Jan 4
4,Jan 5
5,Jan 6
6,Jan 7
7,Jan 8
8,Jan 9
9,Jan 10


In [5]:
print(type(df['date']))  # Series: a single labelled column (1-D, vector-like)
print(type(df[['date']])) # DataFrame: a 2-D table of labelled columns (matrix-like)

# So, a Series is the data structure for a single column of a DataFrame, and a DataFrame
# can be thought of as a collection of Series that share the same index.
# NOTE: this is a conceptual model. How pandas lays the data out in memory is an internal
# detail and should not be assumed to be literally a collection of Series objects.

<class 'pandas.core.series.Series'>
<class 'pandas.core.frame.DataFrame'>


In [6]:
print(df[['date', 'temperature_F']]) # to select two or more columns at once, pass a list of column names (hence the double brackets)

     date  temperature_F
0   Jan 1             27
1   Jan 2             31
2   Jan 3             23
3   Jan 4             34
4   Jan 5             37
5   Jan 6             38
6   Jan 7             29
7   Jan 8             30
8   Jan 9             35
9  Jan 10             30


In [7]:
# .loc: label-based access
print(df.loc[0]) # row whose index label is 0 (the first row here)
print(df.loc[:, ['date', 'temperature_F']]) # all rows, only the columns named in the list


date             Jan 1
temperature_F       27
temperature_C    -2.78
humidity_pct        45
Events           sunny
is_rain          False
Name: 0, dtype: object
     date  temperature_F
0   Jan 1             27
1   Jan 2             31
2   Jan 3             23
3   Jan 4             34
4   Jan 5             37
5   Jan 6             38
6   Jan 7             29
7   Jan 8             30
8   Jan 9             35
9  Jan 10             30


In [8]:
# Q: want to know both the date and temperature, when it was raining?
# Option 1 - one .loc call with a boolean row mask and a list of column names:
# print(df.loc[df["Events"] == "rainy", ["date", "temperature_F"]])

# Option 2 - filter the rows first, then pick the columns from the filtered result (no .loc):
df[df["Events"] == "rainy"][["date", "temperature_F"]]

# No need to store the result in a variable just to look at it:
# the last expression of a cell is displayed automatically.

Unnamed: 0,date,temperature_F
2,Jan 3,23
5,Jan 6,38
8,Jan 9,35


In [9]:
# Q: why does this not work for me?

# The mask df[["Events"]] == "rainy" uses double brackets, so it is a boolean DataFrame, not a Series.
# Indexing a DataFrame with a boolean DataFrame does not filter rows: the result keeps the original
# shape and individual cells are masked, matched by row AND column label. The mask only has an
# "Events" column while the frame being indexed only has a "date" column, so no cell matches
# and every value comes back as NaN.
df[["date"]] [df[["Events"]] == "rainy"]
# Fix: build the mask with single brackets, df["Events"] == "rainy", so it is a boolean Series that selects rows.

Unnamed: 0,date
0,
1,
2,
3,
4,
5,
6,
7,
8,
9,


In [10]:
print(type(df[["Events"]] == "rainy")) # comparing a DataFrame (double brackets) gives a boolean DataFrame

<class 'pandas.core.frame.DataFrame'>


In [11]:
# Q: how does this work?

df["date"] [df["Events"] == "rainy"] # the mask is a boolean Series, so only the rows where it is True are kept

2    Jan 3
5    Jan 6
8    Jan 9
Name: date, dtype: object

In [12]:
print(type(df.Events == "rainy"))  # comparing a single column (a Series) gives a boolean Series

<class 'pandas.core.series.Series'>


In [13]:
# Q: what if individual values in a CSV file contain a comma? [good boundary case (practice)]

# Start from a small DataFrame built from a list of rows, where every value is a plain integer:

rows = [
    [11, 12, 13, 14],
    [21, 22, 23, 24],
    [31, 32, 33, 34],
]
df_test = pd.DataFrame(rows)
print(df_test.to_string())

    0   1   2   3
0  11  12  13  14
1  21  22  23  24
2  31  32  33  34


In [14]:
# Same table, but the last value is written with a comma as thousands separator.
# '3,400' is a string literal, so that cell holds text, not a number.
rows_with_comma = [
    [11, 12, 13, 14],
    [21, 22, 23, 24],
    [31, 32, 33, '3,400'],
]
df_test = pd.DataFrame(rows_with_comma)
print(df_test.to_string())

    0   1   2      3
0  11  12  13     14
1  21  22  23     24
2  31  32  33  3,400


In [15]:
# .iloc takes integer positions: row 2, column 3 is the cell that was written as '3,400'.
cell_value = df_test.iloc[2, 3]
print(cell_value)
print(type(cell_value)) # a Python str, confirming the value is stored as text

3,400
<class 'str'>


In [16]:
# Q: Max temperature day: alternative solution

# Series.max() gives the largest value in the column.
hottest_f = df["temperature_F"].max()
print(hottest_f)

# Boolean mask marking the row(s) whose temperature equals that maximum,
# then select the date for those rows.
is_hottest = df["temperature_F"] == hottest_f
print(df.loc[is_hottest, "date"])

38
5    Jan 6
Name: date, dtype: object


In [17]:
# Indexing errors: slicing rows is a common source of off-by-one mistakes
df[2:5] # rows at positions 2, 3 and 4; the stop value 5 is exclusive

Unnamed: 0,date,temperature_F,temperature_C,humidity_pct,Events,is_rain
2,Jan 3,23,-5.0,55,rainy,True
3,Jan 4,34,1.11,60,sunny,False
4,Jan 5,37,2.78,65,sunny,False


In [18]:
# .loc  : label-based indexing (uses the index / column labels)
# .iloc : integer-location based indexing (uses 0-based positions)
# NOTE: this rebinds df to a new single-column frame with letter labels;
# the DataFrame previously held in df is no longer reachable through that name.
row_labels = list("abcdefghi")
df = pd.DataFrame(list(range(1, 10)), index=row_labels)
print(df.to_string())

   0
a  1
b  2
c  3
d  4
e  5
f  6
g  7
h  8
i  9


In [19]:
print(df.loc[ : 'd']) # label-based slice: from the first row up to AND including the row labelled 'd'

   0
a  1
b  2
c  3
d  4


In [20]:
print(df.iloc[ :-4]) # integer-location based slice: everything except the last 4 rows
# .loc is label-based, so this negative positional slice does not carry over to .loc on this letter-labelled index.

   0
a  1
b  2
c  3
d  4
e  5
