In [3]:
import pandas as pd
import numpy as np

# Creating DataFrames

In [4]:
# Load a CSV file (here fetched straight from its URL) into a DataFrame
FRUITS_CSV_URL = "https://github.com/firasm/bits/raw/master/fruits.csv"
fruits = pd.read_csv(FRUITS_CSV_URL)

In [5]:
# Show the first rows of the DataFrame.
# With no argument, head() returns the first 5 rows.
# If the dataset has fewer than 5 rows, the whole dataset is returned.
fruits.head()

Unnamed: 0,Fruit Name,Mass(g),Colour,Rating
0,Apple,200,Red,8
1,Banana,250,Yellow,9
2,Cantoloupe,600,Orange,10


In [6]:
# Build a DataFrame from a flat list.
# Every element of the list becomes its own row, all under a single column.
# pd.DataFrame() accepts a list whose elements can be of any type.
lstOfGenders = [
    'Male',
    'Female',
]
pd.DataFrame(data=lstOfGenders)


Unnamed: 0,0
0,Male
1,Female


In [7]:
    # If we have a list of lists, for example:
lst = [
    [1, 2],
    [2, 3],
    [3, 4],
]
pd.DataFrame(data=lst)
# Each element of the outer list still occupies one row.
# Because every element is itself a list, extra columns are created to hold its values.
    

Unnamed: 0,0,1
0,1,2
1,2,3
2,3,4


In [8]:
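# Rules for the pd.DataFrame() constructor
    #1) The elements of the list should all be the same kind of container as the first element
        # Ex: pd.DataFrame( [ {'Hello':'Hi'}, (3,4,5) ]  ) --> Will not work: the first element is a dictionary, so the other elements are expected to be dictionaries too.
    
    #2) The data must be passed as ONE object; several separate lists are not treated as extra rows
        # Ex: pd.DataFrame( [1,2,3], [4,5,6], [7,8,9]) --> Cannot do (the 2nd and 3rd positional arguments are `index` and `columns`, not more data)
        
        # However, as long as the second list passed is the same length as the first list, it is accepted: it is used as `index`, i.e. the names (labels) of the rows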
    

In [9]:
# Build a DataFrame from a single dictionary of lists.
# Each key becomes the name of a column, and the values stored under
# that key fill the rows of that column, one value per row.
dct = {
    'Greetings': ['Hello', 'Hi', 'Hey'],
    'Farewells': ['Goodbye', 'Bye', 'See you later'],
}

phrases = pd.DataFrame(data=dct)
phrases

Unnamed: 0,Greetings,Farewells
0,Hello,Goodbye
1,Hi,Bye
2,Hey,See you later


In [10]:
# Turn a list of dictionaries into a DataFrame.
# Each dictionary describes one row; the keys become the column names.

fruit1 = {'Fruit Name': 'Apple', 'Mass (g)': 200, 'Colour': 'Red', 'Rating': 8}
fruit2 = {'Fruit Name': 'Banana', 'Mass (g)': 250, 'Colour': 'Yellow', 'Rating': 9}
fruit3 = {'Fruit Name': 'Cantoloupe', 'Mass (g)': 600, 'Colour': 'Orange', 'Rating': 10}

fruits = pd.DataFrame(data=[fruit1, fruit2, fruit3])
fruits

Unnamed: 0,Fruit Name,Mass (g),Colour,Rating
0,Apple,200,Red,8
1,Banana,250,Yellow,9
2,Cantoloupe,600,Orange,10


# Grabbing From DataFrames

#### Grab Column


In [11]:
# Indexing with a single column name returns that column as a Series
fruits['Rating']

0     8
1     9
2    10
Name: Rating, dtype: int64

#### Grab Multiple Columns

In [12]:
# Indexing with a LIST of column names returns a DataFrame holding just those columns, in the order listed.
# The result is stored in `f`; nothing is displayed because the cell ends with an assignment.
f = fruits[['Rating','Fruit Name']]

#### Grab a Row

In [13]:
# .iloc selects by integer position: position 0 is the first row, returned as a Series
fruits.iloc[0]

Fruit Name    Apple
Mass (g)        200
Colour          Red
Rating            8
Name: 0, dtype: object

In [14]:
# You can also use fruits.loc[<row label>], which is useful when the rows have labels (index values) you want to select by

#### Grab Multiple Rows

In [15]:
# Passing a list of integer positions to .iloc returns those rows as a DataFrame
fruits.iloc[[0,2]]

Unnamed: 0,Fruit Name,Mass (g),Colour,Rating
0,Apple,200,Red,8
2,Cantoloupe,600,Orange,10


In [16]:
fruits.iloc[1:] # Slicing: every row from position 1 to the end

Unnamed: 0,Fruit Name,Mass (g),Colour,Rating
1,Banana,250,Yellow,9
2,Cantoloupe,600,Orange,10


#### Grabbing Rows and Columns

In [17]:
fruits.iloc[[0,2],[1,2]] # The first list gives the row positions, the second list gives the column positions

Unnamed: 0,Mass (g),Colour
0,200,Red
2,600,Orange


In [40]:
# NOTE: the saved output of this cell shows a 'NaN Column'. That column is only created further down,
# in the "Processing" section, so a fresh top-to-bottom run will not show it here.
fruits.iloc[:, 1:] # Slices the DataFrame by columns: all rows, every column from position 1 onward

Unnamed: 0,Mass (g),Colour,Rating,NaN Column
0,200,Red,8,
1,250,Yellow,9,1.0
2,600,Orange,10,


#### Grab a Specific Value in A Specific Column

In [18]:
# Read a single value by integer position: fruits.iat[idxRow, idxColumn]
# Row 0, column 3 is the 'Rating' of the first fruit.
fruits.iat[0,3]

8

In [19]:
# You can also use fruits.at[row, column] to look up a value by row and column labels (names) instead of integer positions

In [20]:
# .iloc and .loc can also be used to grab a single value, in the same way as .iat and .at, e.g. fruits.iloc[0, 3] or fruits.loc[0, 'Rating']

# Turning Values Into a List

#### Turn Columns Into A List

In [48]:
# Iterating over a DataFrame yields its column labels, so list(fruits) is the list of column names.
# NOTE: the saved output includes 'NaN Column', which is only created further down in the
# "Processing" section; a fresh top-to-bottom run will not list it here.
lst = list(fruits)
lst

['Fruit Name', 'Mass (g)', 'Colour', 'Rating', 'NaN Column']

#### Grab The Values ( AKA  rows ) Of a Column, and Put Them Into a List

In [22]:
# Pull the 'Fruit Name' column out as a Series and convert its values to a plain Python list
fruits['Fruit Name'].tolist()

# Dropping

#### Dropping Rows

In [50]:
# Drop rows by index label: remove the rows labelled 1 and 2, keeping only row 0.
# drop() returns a new DataFrame; `fruits` itself is not modified.
# NOTE: the saved output shows a 'NaN Column' that is only created further down in the
# "Processing" section; a fresh top-to-bottom run will not show it here.
fruits.drop(index=list(range(1, 3)))

Unnamed: 0,Fruit Name,Mass (g),Colour,Rating,NaN Column
0,Apple,200,Red,8,


#### Dropping Columns

In [52]:
# Collect every column name except the first, then drop those columns.
lst = list(fruits.columns[1:]) # Slice the column labels to choose which columns to drop

fruits.drop(columns=lst)


Unnamed: 0,Fruit Name
0,Apple
1,Banana
2,Cantoloupe


# Processing ( Dealing With NaN/Null values )

In [53]:
# Make a column that contains NaN (missing) values, used by the null-handling examples that follow.
# A plain list is assigned positionally, one value per existing row, so the result does not
# depend on wrapping the values in a separate DataFrame and aligning it on the index.
# This adds the column to `fruits` in place.
fruits['NaN Column'] = [np.nan, 1, np.nan]
fruits

Unnamed: 0,Fruit Name,Mass (g),Colour,Rating,NaN Column
0,Apple,200,Red,8,
1,Banana,250,Yellow,9,1.0
2,Cantoloupe,600,Orange,10,


In [27]:
# See what's null in the DataFrame: returns a frame of booleans of the same shape,
# with True wherever the value is missing
fruits.isnull()

Unnamed: 0,Fruit Name,Mass (g),Colour,Rating,NaN Column
0,False,False,False,False,True
1,False,False,False,False,False
2,False,False,False,False,True


In [28]:
# Drop every row that contains at least one null value.
# This returns a new DataFrame; `fruits` itself is left unchanged.
fruits.dropna()

Unnamed: 0,Fruit Name,Mass (g),Colour,Rating,NaN Column
1,Banana,250,Yellow,9,1.0


In [29]:
# Drop every column that contains at least one null value
fruits.dropna(axis='columns')

Unnamed: 0,Fruit Name,Mass (g),Colour,Rating
0,Apple,200,Red,8
1,Banana,250,Yellow,9
2,Cantoloupe,600,Orange,10


In [30]:
# Drop only the columns (or rows) in which EVERY value is null
    # Columns
fruits.dropna(axis='columns', how='all')

# 'NaN Column' is kept here because it still holds one non-null value; it would be dropped only if the entire column were null. The default is how='any'

Unnamed: 0,Fruit Name,Mass (g),Colour,Rating,NaN Column
0,Apple,200,Red,8,
1,Banana,250,Yellow,9,1.0
2,Cantoloupe,600,Orange,10,


In [31]:
    # You can do the same by rows, e.g. fruits.dropna(how='all'); rows are the default, so there is no need to specify an axis

In [32]:
# df.dropna() also has a parameter called thresh: it drops the rows or columns (depending on the axis you specify) that do not have at least that many non-null values.

fruits.dropna(axis='columns', thresh = 1) # Drop columns that do not have at least 1 non-null value

# As you can see, 'NaN Column' is kept because it has 1 non-null value.

Unnamed: 0,Fruit Name,Mass (g),Colour,Rating,NaN Column
0,Apple,200,Red,8,
1,Banana,250,Yellow,9,1.0
2,Cantoloupe,600,Orange,10,


In [33]:
# Fill every null value with a specific value (here 0)
fruits.fillna(0)

Unnamed: 0,Fruit Name,Mass (g),Colour,Rating,NaN Column
0,Apple,200,Red,8,0.0
1,Banana,250,Yellow,9,1.0
2,Cantoloupe,600,Orange,10,0.0


In [34]:
# Forward fill: replace each null with the previous value
# (the value above it if axis='rows', the value to its left if axis='columns')
fruits.ffill(axis='columns')

Unnamed: 0,Fruit Name,Mass (g),Colour,Rating,NaN Column
0,Apple,200,Red,8,8.0
1,Banana,250,Yellow,9,1.0
2,Cantoloupe,600,Orange,10,10.0


In [35]:
# Backward fill: replace each null with the next value
# (the value below it if axis='rows', the value to its right if axis='columns').
# A null in the last row stays null, because there is no value below it to copy.
fruits.bfill(axis='rows')

Unnamed: 0,Fruit Name,Mass (g),Colour,Rating,NaN Column
0,Apple,200,Red,8,1.0
1,Banana,250,Yellow,9,1.0
2,Cantoloupe,600,Orange,10,


# Concatenating/Merging Dataframes

#### Concatenation

In [36]:
# Second frame to combine with `fruits`
dct = {
    'Greetings': ['Hello', 'Hi', 'Hey'],
    'Farewells': ['Goodbye', 'Bye', 'See you later'],
}
greets = pd.DataFrame(dct)

# axis='columns': the frames are glued beside each other
foo1 = pd.concat([fruits, greets], axis='columns')
# Default axis (rows): the frames are stacked on top of each other.
# Older pandas versions also offered fruits.append(greets) for this row-wise case only,
# but DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, so use pd.concat.
foo2 = pd.concat([fruits, greets])
display(foo1, foo2)


# If axis='rows', the frames are stacked on top of each other. If axis='columns', they are glued beside each other.
    # When concatenating by rows, the original indices are preserved (0,1,2,0,1,2 in the output).
    # Use the reset_index() method afterwards, or pass ignore_index=True to pd.concat, to renumber the rows.

Unnamed: 0,Fruit Name,Mass (g),Colour,Rating,NaN Column,Greetings,Farewells
0,Apple,200,Red,8,,Hello,Goodbye
1,Banana,250,Yellow,9,1.0,Hi,Bye
2,Cantoloupe,600,Orange,10,,Hey,See you later


Unnamed: 0,Fruit Name,Mass (g),Colour,Rating,NaN Column,Greetings,Farewells
0,Apple,200.0,Red,8.0,,,
1,Banana,250.0,Yellow,9.0,1.0,,
2,Cantoloupe,600.0,Orange,10.0,,,
0,,,,,,Hello,Goodbye
1,,,,,,Hi,Bye
2,,,,,,Hey,See you later


#### Merging

# Changing Values

[This link](https://www.askpython.com/python-modules/pandas/update-the-value-of-a-row-dataframe) is great for showing how to change certain values
