# What is pandas?
- Pandas is a software library written in Python for data manipulation and data analysis

## Pandas is well suited for different kinds of data
- tabular data with heterogeneous columns
- ordered and unordered time series data
- arbitrary matrix data with row and column labels
- any other forms of data

In [1]:
import pandas as pd

# Sample website traffic: one list per column, one entry per day.
xyz_web = {
    "day": [1, 2, 3, 4, 5],
    "visitors": [1000, 2000, 3000, 4000, 500],
    "bounce_rate": [20, 30, 35, 50, 100],
}

# Each dict key becomes a column; rows get the default 0..4 index.
df = pd.DataFrame(data=xyz_web)
print(df)

   day  visitors  bounce_rate
0    1      1000           20
1    2      2000           30
2    3      3000           35
3    4      4000           50
4    5       500          100


# Pandas Operations
- Slicing the DataFrame
- Changing the Index
- Data conversions
- Joining and Merging
- Concatenation
- Changing the column headers

### Slicing

In [3]:
import pandas as pd

# Rebuild the sample traffic frame that the slicing cells below operate on.
xyz_web = {
    "day": [1, 2, 3, 4, 5],
    "visitors": [1000, 2000, 3000, 4000, 500],
    "bounce_rate": [20, 30, 35, 50, 100],
}

# Column order follows the dict's key order: day, visitors, bounce_rate.
df = pd.DataFrame(data=xyz_web)
print(df)

   day  visitors  bounce_rate
0    1      1000           20
1    2      2000           30
2    3      3000           35
3    4      4000           50
4    5       500          100


In [5]:
# Slice from the top: the first two rows of the frame.
df.head(2)

Unnamed: 0,day,visitors,bounce_rate
0,1,1000,20
1,2,2000,30


In [13]:
# Slice from the bottom: the last two rows of the frame.
df.tail(2)

Unnamed: 0,day,visitors,bounce_rate
3,4,4000,50
4,5,500,100


### Merging

In [5]:
import pandas as pd

# Two frames with identical column names and the same 1..4 row labels.
df1 = pd.DataFrame(
    {
        "day": [1, 2, 3, 4],
        "visitors": [1000, 200, 500, 6000],
        "bounce_rate": [10, 20, 33, 44],
    },
    index=[1, 2, 3, 4],
)
df2 = pd.DataFrame(
    {
        "day": [1, 2, 3, 4],
        "visitors": [100, 200, 300, 400],
        "bounce_rate": [10, 20, 34, 44],
    },
    index=[1, 2, 3, 4],
)

# Default (inner) merge on both key columns: day 3 is dropped because its
# bounce_rate differs between the frames (33 vs 34). The non-key "visitors"
# column appears twice, suffixed _x (left frame) and _y (right frame).
# Run help(pd.merge) for the full list of options.
df = df1.merge(df2, on=["day", "bounce_rate"])
print(df)

   day  visitors_x  bounce_rate  visitors_y
0    1        1000           10         100
1    2         200           20         200
2    4        6000           44         400


In [18]:
# Outer merge keeps the rows of both frames; with no `on=` given, all columns
# the frames have in common are used as keys.
# `how` also accepts "left", "right" and "inner".
df = df1.merge(df2, how="outer")
print(df)

   day  visitors  bounce_rate
0    1      1000           10
1    2       200           20
2    3       500           33
3    4      6000           44
4    1       100           10
5    3       300           33
6    4       400           44


### Joining

In [7]:
# Two frames with different columns, indexed by year; only 2002 and 2004
# appear in both indexes.
df1 = pd.DataFrame(
    {
        "visitors": [1000, 200, 500, 6000],
        "bounce_rate": [10, 20, 33, 44],
    },
    index=[2001, 2002, 2003, 2004],
)
df2 = pd.DataFrame(
    {
        "employees": [100, 200, 300, 400],
        "attrition_rate": [10, 20, 33, 44],
    },
    index=[2002, 2005, 2004, 2007],
)

# join() aligns on the index and keeps every row of the left frame; years
# missing from df2 get NaN in the columns that come from df2.
joined = df1.join(df2)
print(joined)

      visitors  bounce_rate  employees  attrition_rate
2001      1000           10        NaN             NaN
2002       200           20      100.0            10.0
2003       500           33        NaN             NaN
2004      6000           44      300.0            33.0


### Changing the index and column headers

In [8]:
import pandas as pd

df = pd.DataFrame(
    {
        "day": [1, 2, 3, 4],
        "visitors": [100, 200, 300, 400],
        "bounce_rate": [10, 20, 30, 40],
    }
)

# Change a column header: "visitors" becomes "users".
df = df.rename(columns={"visitors": "users"})
print(df)

# Change the index: the "day" column replaces the default 0..3 row labels.
df = df.set_index("day")
print(df)

   day  users  bounce_rate
0    1    100           10
1    2    200           20
2    3    300           30
3    4    400           40
     users  bounce_rate
day                    
1      100           10
2      200           20
3      300           30
4      400           40


### Concatenation

In [10]:
import pandas as pd

# Days 1-4, indexed by day.
df1 = pd.DataFrame(
    {
        "day": [1, 2, 3, 4],
        "visitors": [100, 200, 300, 400],
        "bounce_rate": [10, 20, 30, 40],
    }
).set_index("day")

# Days 5-8, same columns and index name.
df2 = pd.DataFrame(
    {
        "day": [5, 6, 7, 8],
        "visitors": [500, 600, 700, 800],
        "bounce_rate": [11, 22, 33, 44],
    }
).set_index("day")

# Stack the second frame's rows underneath the first frame's rows.
con = pd.concat([df1, df2])
print(con)

     visitors  bounce_rate
day                       
1         100           10
2         200           20
3         300           30
4         400           40
5         500           11
6         600           22
7         700           33
8         800           44


### Data Munging
- changing data from one format to another (csv to html)

In [61]:
import pandas as pd
# Load the books dataset from CSV.
# NOTE(review): absolute path under /home/rajendra — point it at your own copy of books.csv.
df = pd.read_csv("/home/rajendra/Desktop/personal/AI-ML/practice/data/books.csv")

In [62]:
# List the column labels of the loaded frame.
df.columns

Index(['bookID', 'title', 'authors', 'isbn13', 'language_code', 'price',
       'ratings_count', 'text_reviews_count'],
      dtype='object')

In [53]:
# Preview the first rows (head() defaults to five).
df.head()

Unnamed: 0,bookID,title,authors,isbn13,language_code,price,ratings_count,text_reviews_count
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling-Mary GrandPré,9780439785969,eng,652,1944099,26249
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling-Mary GrandPré,9780439358071,eng,870,1996446,27613
2,3,Harry Potter and the Sorcerer's Stone (Harry P...,J.K. Rowling-Mary GrandPré,9780439554930,eng,320,5629932,70390
3,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,9780439554893,eng,352,6267,272
4,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling-Mary GrandPré,9780439655484,eng,435,2149872,33964


In [24]:
# Print a summary of the frame: index range, per-column non-null counts and
# dtypes, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11003 entries, 0 to 11002
Data columns (total 10 columns):
bookID                11003 non-null int64
title                 11003 non-null object
authors               11003 non-null object
average_rating        11003 non-null object
isbn                  11003 non-null object
isbn13                11003 non-null object
language_code         11003 non-null object
# num_pages           11003 non-null object
ratings_count         11003 non-null object
text_reviews_count    11003 non-null object
dtypes: int64(1), object(9)
memory usage: 859.7+ KB


In [27]:
# Rename the '# num_pages' column to plain 'num_pages', modifying df in place.
df.rename(columns={'# num_pages':"num_pages"}, inplace=True)

In [38]:
# Preview the first five rows again to check the column headers.
df.head()

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling-Mary GrandPré,4.56,439785960,9780439785969,eng,652,1944099,26249
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling-Mary GrandPré,4.49,439358078,9780439358071,eng,870,1996446,27613
2,3,Harry Potter and the Sorcerer's Stone (Harry P...,J.K. Rowling-Mary GrandPré,4.47,439554934,9780439554930,eng,320,5629932,70390
3,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.41,439554896,9780439554893,eng,352,6267,272
4,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling-Mary GrandPré,4.55,043965548X,9780439655484,eng,435,2149872,33964


In [58]:
# Write the frame to books.csv (relative path); index=False leaves the row
# index out of the file.
df.to_csv("books.csv", index=False)

In [59]:
# Load books.csv (relative path) into a separate frame named books.
books = pd.read_csv('books.csv')

In [60]:
# Preview the first five rows of the frame read from books.csv.
books.head()

Unnamed: 0,bookID,title,authors,isbn13,language_code,price,ratings_count,text_reviews_count
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling-Mary GrandPré,9780439785969,eng,652,1944099,26249
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling-Mary GrandPré,9780439358071,eng,870,1996446,27613
2,3,Harry Potter and the Sorcerer's Stone (Harry P...,J.K. Rowling-Mary GrandPré,9780439554930,eng,320,5629932,70390
3,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,9780439554893,eng,352,6267,272
4,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling-Mary GrandPré,9780439655484,eng,435,2149872,33964


In [65]:
import pandas as pd

# Data munging example (CSV -> HTML): load the books CSV ...
# NOTE(review): absolute path under /home/rajendra — point it at your own copy of books.csv.
books = pd.read_csv("/home/rajendra/Desktop/personal/AI-ML/practice/data/books.csv")
# ... and write the same table out as an HTML file named books.html.
books.to_html("books.html")

### Example

In [110]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
# Select matplotlib's "fivethirtyeight" style sheet.
style.use("fivethirtyeight")
# NOTE(review): the variable is named `books` but it holds unemployment.csv data;
# the absolute path under /home/rajendra must be adjusted to your own copy.
books = pd.read_csv("/home/rajendra/Desktop/personal/AI-ML/practice/data/india-statewise-information/unemployment.csv")
# Keep only the first five rows for this example.
df = books.head(5)
# Bare expression: shown as the cell's output in a notebook.
df

Unnamed: 0,Rank,State,Total,Urban,Rural
0,1,Gujarat,10,15,7
1,2,Karnataka,15,19,13
2,3,Chhattisgarh,19,68,11
3,4,Maharashtra,21,23,20
4,5,Telangana,28,62,13


### Python for statistics
- mean - average value of a particular list
- median - middle/center value of the sorted list (for an even number of elements, the average of the two middle values; the low/high median picks one of them instead)
- mode - value that is repeated most often
- variance - how much the elements of the list vary from the mean

In [92]:
from statistics import mean

# Arithmetic mean: (1 + 2 + 3 + 4) / 4 = 2.5
print(mean([1,2,3,4]))

2.5


In [104]:
from statistics import median

# Sorted data is [1, 1, 4, 5.5, 6, 7]; with an even number of elements the
# median is the average of the two middle values: (4 + 5.5) / 2 = 4.75.
print(median([1,7,6, 4, 5.5, 1]))   # sorts the data first, then takes the middle value

4.75


In [107]:
from statistics import mode

print(mode([1,2,3,3,3,4,4,5,5]))  # 3 is the most repeated value (it appears three times)

3


In [109]:
from statistics import variance, mean
print(mean([1,2,3,4,5]))
# Sample variance: subtract the mean from every number, square each
# difference, add up the squares, then divide by n - 1.
# Here: (4 + 1 + 0 + 1 + 4) / 4 = 2.5
print(variance([1,2,3,4,5]))

3
2.5
