# Pandas Complete Tutorial
### By Samrat Mitra (https://www.github.com/lionelsamrat10)

## Import Libraries

In [1]:
import pandas as pd
import numpy as np

## Create a Dictionary in Python

In [4]:
# Column-oriented data: every key is a column name and every list holds
# that column's values, one entry per student.
dict1 = {
    'name': ['Samrat', 'Sarah', 'Dave', 'Sophie'],
    'marks': [50, 49, 47, 50],
    'city': ['Jalpaiguri', 'Los Angeles', 'Montreal', 'London'],
}

## Convert the Dictionary to a Dataframe

***Convert the dictionary dict1 to a dataframe***

In [5]:
# Each dictionary key becomes a column; rows get the default 0..n-1 index.
df = pd.DataFrame(data=dict1)

***Print the Dataframe***

In [8]:
# A bare name on the last line of a cell displays the DataFrame as a table
df

Unnamed: 0,name,marks,city
0,Samrat,50,Jalpaiguri
1,Sarah,49,Los Angeles
2,Dave,47,Montreal
3,Sophie,50,London


***Write the DataFrame to a CSV file***

In [11]:
# index=False leaves the row labels (0, 1, 2, ...) out of the written file
df.to_csv('students.csv', index = False)

## Get the first few rows of a dataframe

In [12]:
# Show only the first two rows
df.head(n=2)

Unnamed: 0,name,marks,city
0,Samrat,50,Jalpaiguri
1,Sarah,49,Los Angeles


## Get the last few rows of a dataframe

In [13]:
# Show only the last two rows
df.tail(n=2)

Unnamed: 0,name,marks,city
2,Dave,47,Montreal
3,Sophie,50,London


## Get the shape of the dataframe

In [14]:
# shape is a (number of rows, number of columns) tuple
df.shape

(4, 3)

## Get the Description of the data

In [17]:
# Summary statistics (count, mean, std, min, quartiles, max); only the
# numeric 'marks' column appears in the result
df.describe()

Unnamed: 0,marks
count,4.0
mean,49.0
std,1.414214
min,47.0
25%,48.5
50%,49.5
75%,50.0
max,50.0


## Read a CSV File

In [18]:
# Load the salary data set; the relative path is resolved against the
# current working directory, so Salary_Data.csv must be present there
dataframe = pd.read_csv('Salary_Data.csv')

In [19]:
# Preview the first ten rows instead of dumping the whole frame
dataframe.head(10)

Unnamed: 0,YearsExperience,Salary
0,1.1,39343.0
1,1.3,46205.0
2,1.5,37731.0
3,2.0,43525.0
4,2.2,39891.0
5,2.9,56642.0
6,3.0,60150.0
7,3.2,54445.0
8,3.2,64445.0
9,3.7,57189.0


In [21]:
# Summary statistics for the numeric columns of the salary data
dataframe.describe()

Unnamed: 0,YearsExperience,Salary
count,30.0,30.0
mean,5.313333,76003.0
std,2.837888,27414.429785
min,1.1,37731.0
25%,3.2,56720.75
50%,4.7,65237.0
75%,7.7,100544.75
max,10.5,122391.0


In [22]:
# (number of rows, number of columns) of the salary data
dataframe.shape

(30, 2)

### Get the Salary of the 10th employee using the dataframe

In [25]:
# Row label 9 is the 10th row of the default 0-based index. A single .loc
# lookup selects row and column together, avoiding chained indexing
# (dataframe['Salary'][9]).
dataframe.loc[9, 'Salary']

57189.0

### Reading an HTML table from the web

In [36]:
# pd.read_html parses the tables found at the URL into a list of DataFrames;
# header=0 uses the first row of each table as its column names.
url = 'https://en.wikipedia.org/wiki/List_of_Germans_by_net_worth'
tables = pd.read_html(url, header=0)

# Keep the first table on the page.
df_net_worth = tables[0]

# Sanity checks: the table has rows and no name appears twice.
# Bracket access is used for the column so the lookup can never be
# confused with a DataFrame attribute.
assert len(df_net_worth) > 0, "the scraped table is empty"
assert df_net_worth['Name'].is_unique, "duplicate entries in the 'Name' column"

# Preview the first five rows.
df_net_worth.head()

Unnamed: 0,World ranking,Name,Citizenship,Net worth (USD),Sources of wealth
0,34,Beate Heister (b. Albrecht) & Karl Albrecht Jr.,Germany,39.2 billion,Aldi Süd
1,38,Dieter Schwarz,Germany,36.9 billion,Schwarz Gruppe
2,53,Susanne Klatten,Germany,27.7 billion,"BMW, Altana, Nordex, SGL Carbon"
3,58,Klaus-Michael Kühne,Germany,26.3 billion,Kuehne + Nagel
4,81,Stefan Quandt,Germany,21.6 billion,BMW


In [37]:
# Preview the first ten rows instead of dumping the whole scraped table
df_net_worth.head(10)

Unnamed: 0,World ranking,Name,Citizenship,Net worth (USD),Sources of wealth
0,34,Beate Heister (b. Albrecht) & Karl Albrecht Jr.,Germany,39.2 billion,Aldi Süd
1,38,Dieter Schwarz,Germany,36.9 billion,Schwarz Gruppe
2,53,Susanne Klatten,Germany,27.7 billion,"BMW, Altana, Nordex, SGL Carbon"
3,58,Klaus-Michael Kühne,Germany,26.3 billion,Kuehne + Nagel
4,81,Stefan Quandt,Germany,21.6 billion,BMW
5,98,Theo Albrecht Jr.,Germany,18.8 billion,Aldi Nord and Trader Joe's
6,115,Reinhold Wuerth,Germany,16.8 billion,Würth Group
7,133,Georg Schaeffler,Germany,14.9 billion,Schaeffler Group
8,186,Alexander Otto,Germany,11.8 billion,Otto Group
9,200,Thomas Strüngmann,Germany,11.0 billion,"Hexal, BioNTech"


### Reading from databases is also possible.

For example, you can **read** from Microsoft SQL Server using **pyodbc** and **pd.read_sql(sql_code, connection)**.

Methods on DataFrames **return a new instance** by default. In other words, they behave like methods on immutable Python objects, not like methods on mutable objects.

In [38]:
# A list is MUTABLE: sort() reorders the existing object and returns None,
# so there is nothing to assign back.
scores = [6, 2, 4, 9, 1]
scores.sort()
print(scores)

# A string is IMMUTABLE: capitalize() returns a new string, so the result
# has to be bound to a name to be kept.
my_name = "tommy"
my_name = my_name.capitalize()
print(my_name)

[1, 2, 4, 6, 9]
Tommy


### How to change the index values

In [42]:
# Assigning to .index relabels the rows of df itself; the default
# 0, 1, 2, 3 labels are replaced by the four names given here
df.index = ['first', 'second', 'third', 'fourth']

In [44]:
# The row labels are now 'first', 'second', 'third', 'fourth' instead of 0, 1, 2, 3
df

Unnamed: 0,name,marks,city
first,Samrat,50,Jalpaiguri
second,Sarah,49,Los Angeles
third,Dave,47,Montreal
fourth,Sophie,50,London


### Two data structures in Pandas: Series and DataFrame

In [48]:
# Selecting a single column yields a Series
type(df['marks'])

pandas.core.series.Series

In [49]:
# The table as a whole is a DataFrame
type(df)

pandas.core.frame.DataFrame