In [2]:
import pandas as pd
import os

# Explain Pandas

https://www.youtube.com/watch?v=CmorAWRsCAw&list=PLeo1K3hjS3uuASpe-1LjfG5f14Bnozjwy

In [56]:
# Define a data frame 
df = pd.read_csv("../DATA/TestingData/Weather.csv", delimiter=";")
df

Unnamed: 0,day,windspeed,event,temperature
0,01.01.2018,50,Rain,30
1,02.01.2018,20,Sunny,31
2,03.01.2018,60,Rain,32
3,04.01.2018,80,Cloud,34
4,05.01.2018,50,Cloud,30


In [57]:
# But we can also tell it to create a table itself with data we have 
weather_data = {
    'day': ['01/01/2018', '02/01/2018', '03/01/2018', '04/01/2018', '05/01/2018'],
    'windspeed': [50, 20, 60, 80, 50],
    'event': ['Rain', 'Sunny', 'Rain', 'CLoud', 'Cloud'],
    'temperature': [30, 31, 32, 34, 30]
}
df2 = pd.DataFrame(weather_data)
df2

Unnamed: 0,day,event,temperature,windspeed
0,01/01/2018,Rain,30,50
1,02/01/2018,Sunny,31,20
2,03/01/2018,Rain,32,60
3,04/01/2018,CLoud,34,80
4,05/01/2018,Cloud,30,50


In [58]:
# Get the dimensions of a table
rows, columns = df.shape

In [59]:
# Shows the first 5 rows 
df.head()
# If we want first 3 rows: df.head(3)

Unnamed: 0,day,windspeed,event,temperature
0,01.01.2018,50,Rain,30
1,02.01.2018,20,Sunny,31
2,03.01.2018,60,Rain,32
3,04.01.2018,80,Cloud,34
4,05.01.2018,50,Cloud,30


In [60]:
df.columns
# len(df.columns)

Index([u'day', u'windspeed', u'event', u'temperature'], dtype='object')

In [61]:
# Can access different colums and see all instances of that 
df['event']
df['event'].value_counts()

Rain     2
Cloud    2
Sunny    1
Name: event, dtype: int64

In [62]:
# All elements in Pandas are of type series (like array)
type(df['event'])

pandas.core.series.Series

In [63]:
# Find the max / min / mean value of a column
df['windspeed'].max()

80

In [64]:
# Shows mean / std etc for all data columns where we have numerical data 
df.describe()

Unnamed: 0,windspeed,temperature
count,5.0,5.0
mean,52.0,31.4
std,21.679483,1.67332
min,20.0,30.0
25%,50.0,30.0
50%,50.0,31.0
75%,60.0,32.0
max,80.0,34.0


# Querying data in the data frame

In [65]:
# Query all the rows where the windspeed is greater than 40
df[df['windspeed']>40]

Unnamed: 0,day,windspeed,event,temperature
0,01.01.2018,50,Rain,30
2,03.01.2018,60,Rain,32
3,04.01.2018,80,Cloud,34
4,05.01.2018,50,Cloud,30


In [66]:
# Query the days where it was cloudy 
df[df['event']=='Cloud']

Unnamed: 0,day,windspeed,event,temperature
3,04.01.2018,80,Cloud,34
4,05.01.2018,50,Cloud,30


In [67]:
# Query where the temperature is a maximum value
df[df['temperature']==df['temperature'].max()]

Unnamed: 0,day,windspeed,event,temperature
3,04.01.2018,80,Cloud,34


In [68]:
# return only the day when that is true 
df[['day','temperature']][df['temperature']==df['temperature'].max()]

Unnamed: 0,day,temperature
3,04.01.2018,34


# All operations available

https://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.html

# Indexing
We can have normal indices as by df.index but we can also define or re-define a column as index column, for example the date
would be suitable as an index 

We can basically have any column as an index, but just as in spatial databases module, we must make sure that it is a unique identifier 


In [69]:
df.index

RangeIndex(start=0, stop=5, step=1)

In [70]:
# Re-define the index 
# Note: it doesnt change anything in df, unless we either
# a) write: inplace = True
# b) give it a new instance (df2 = df.set_index('day'))
df.set_index('day')

Unnamed: 0_level_0,windspeed,event,temperature
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
01.01.2018,50,Rain,30
02.01.2018,20,Sunny,31
03.01.2018,60,Rain,32
04.01.2018,80,Cloud,34
05.01.2018,50,Cloud,30


In [71]:
df.set_index('day', inplace=True)

In [72]:
df

Unnamed: 0_level_0,windspeed,event,temperature
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
01.01.2018,50,Rain,30
02.01.2018,20,Sunny,31
03.01.2018,60,Rain,32
04.01.2018,80,Cloud,34
05.01.2018,50,Cloud,30


In [74]:
# Access the data line via the new index now
df.loc['01.01.2018']

windspeed        50
event          Rain
temperature      30
Name: 01.01.2018, dtype: object

In [77]:
# Re-setting the index 
df.reset_index(inplace=True)
df

Unnamed: 0,index,day,windspeed,event,temperature
0,0,01.01.2018,50,Rain,30
1,1,02.01.2018,20,Sunny,31
2,2,03.01.2018,60,Rain,32
3,3,04.01.2018,80,Cloud,34
4,4,05.01.2018,50,Cloud,30


# Create a column and drop a column

Here, we can easily create a column 

In [87]:
# Create nonsense table 
df['new_column']=df['windspeed']/df['temperature']
df

Unnamed: 0,index,day,windspeed,event,temperature,new_column
0,0,01.01.2018,50,Rain,30,1.666667
1,1,02.01.2018,20,Sunny,31,0.645161
2,2,03.01.2018,60,Rain,32,1.875
3,3,04.01.2018,80,Cloud,34,2.352941
4,4,05.01.2018,50,Cloud,30,1.666667


In [88]:
df.drop(['new_column'], axis=1, inplace=True)
df

Unnamed: 0,index,day,windspeed,event,temperature
0,0,01.01.2018,50,Rain,30
1,1,02.01.2018,20,Sunny,31
2,2,03.01.2018,60,Rain,32
3,3,04.01.2018,80,Cloud,34
4,4,05.01.2018,50,Cloud,30
