In [5]:
!python3 -m pip install pandas



In [6]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Series (1D)

In [17]:
data = np.array([1, 2,5, 8])
index = ['a', 'b', 'c', 'd']
s1 = pd.Series(data, index=index)

print(s1)

a    1
b    2
c    5
d    8
dtype: int64


## Index
 * Main difference between Pandas Series and Numpy Array is index in former.
 * In that way, Pandas Series is like a middle-ground for lists and dictionaries.

In [20]:
data = np.array([3, 2,8, 9])
index = ['d', 'b', 'c', 'a']
s2 = pd.Series(data, index=index)

print(s2)

d    3
b    2
c    8
a    9
dtype: int64


### Position VS Index

In [27]:
data = np.array([3, 2,8, 9])
index = ['d', 'b', 'c', 'e']
s2 = pd.Series(data, index=index)

print(s2)

d    3
b    2
c    8
e    9
dtype: int64


In [30]:
s = s1.add(s2, fill_value=0)

print(s)

a     1.0
b     4.0
c    13.0
d    11.0
e     9.0
dtype: float64


### Default Indices

## Vectorized Operations
Cases for v1 v2:

 * Both have same indices
 * Overlapping indices, but not same
 * Different indices

In [26]:
s = s1 * 3
print(s)

a    2
b    3
c    6
d    9
dtype: int64


### Fill Value and Drop NaN¶

### Apply lambda on Series

In [34]:


s = (s1 ).apply(lambda x : x > 2 and x < 10)

print(s1)

a    1
b    2
c    5
d    8
dtype: int64


## Dataframe (2D)

#### Dataframe from python dict

In [42]:
dict = {
    "age" : [10, 22, 24, 50],
    "name" : ["Mohit", "Shubham", "Rohit", "Rohan"]
}

df = pd.DataFrame(dict, index=dict["name"])

print(df)

         age     name
Mohit     10    Mohit
Shubham   22  Shubham
Rohit     24    Rohit
Rohan     50    Rohan


#### Dataframe from python numpy

In [68]:
data = np.arange(20).reshape(-1, 2)

v = data.reshape(-1, 4)

frame = pd.DataFrame(v, columns=["one", "two", "three", "four"])

frame = frame.set_index(frame.three)

# print(frame)

print(frame.values)

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]
 [12 13 14 15]
 [16 17 18 19]]


#### reading, saving from file (csv, excel)

In [73]:
data = pd.read_csv("../dataset/movie_metadata.csv")


#### Dataframe default index vs changing index

In [90]:
type(data.head())

# print(data.head())

print(type(data[["color", "director_name"]]))
data = data[["color", "director_name"]]

<class 'pandas.core.frame.DataFrame'>


In [95]:
data["d_f_name"] = data["director_name"].apply(lambda x : str(x).split(" ")[0])
data["d_s_name"] = data["director_name"].apply(lambda x : str(x).split(" ")[-1])

print(data.head())

   color      director_name     d_f_name   d_s_name
0  Color      James Cameron        James    Cameron
1  Color     Gore Verbinski         Gore  Verbinski
2  Color         Sam Mendes          Sam     Mendes
3  Color  Christopher Nolan  Christopher      Nolan
4    NaN        Doug Walker         Doug     Walker


#### Series as attributes

#### Re-arrange and renaming of in dataframe

In [101]:
data.rename(columns = {"d_f_name" : "first"}, inplace=True)

print(data)

                 color       director_name        first     d_s_name
0                Color       James Cameron        James      Cameron
1                Color      Gore Verbinski         Gore    Verbinski
2                Color          Sam Mendes          Sam       Mendes
3                Color   Christopher Nolan  Christopher        Nolan
4                  NaN         Doug Walker         Doug       Walker
5                Color      Andrew Stanton       Andrew      Stanton
6                Color           Sam Raimi          Sam        Raimi
7                Color        Nathan Greno       Nathan        Greno
8                Color         Joss Whedon         Joss       Whedon
9                Color         David Yates        David        Yates
10               Color         Zack Snyder         Zack       Snyder
11               Color        Bryan Singer        Bryan       Singer
12               Color        Marc Forster         Marc      Forster
13               Color      Gore V

#### Creating new Series

In [104]:
data[:10].to_excel("../dataset/updated.xlsx")