# Pandas
- Pandas is an open-source software library for the Python programming language, primarily used for data manipulation and analysis.
- It provides data structures and operations for manipulating numerical tables and time series, making it a fundamental tool in the data science ecosystem. 

In [5]:
import pandas as pd

### Series

In [10]:
A = pd.Series([2,3,4,5], index=['a','b','c','d'])
print(A.values)
print(type(A.values))
print(type(A))
print(A['a'])
print(A['a':'c'])

[2 3 4 5]
<class 'numpy.ndarray'>
<class 'pandas.core.series.Series'>
2
a    2
b    3
c    4
dtype: int64


In [17]:
grads_dict = {'A':4,'B':3.5,'C':3,'D':2.5}
grads = pd.Series(grads_dict)
print(grads.values)

marks_dict = {'A':85,'B':75,'C':65,'D':55}
marks = pd.Series(marks_dict)
print(marks)

print(marks['A'])
print(marks[0:2])

[4.  3.5 3.  2.5]
A    85
B    75
C    65
D    55
dtype: int64
85
A    85
B    75
dtype: int64


### DataFrames
- 2d dimensional data

In [24]:
D = pd.DataFrame({'Marks':marks, 'Grades':grads})
print(D)
print(D.T)

print(D.values[2,0])
print(D.columns)

D['ScaledMarks'] = 100*(D['Marks']/90)  # Add a new column
print(D)
del(D['ScaledMarks'])  # Delete column
print(D)

   Marks  Grades
A     85     4.0
B     75     3.5
C     65     3.0
D     55     2.5
           A     B     C     D
Marks   85.0  75.0  65.0  55.0
Grades   4.0   3.5   3.0   2.5
65.0
Index(['Marks', 'Grades'], dtype='object')
   Marks  Grades  ScaledMarks
A     85     4.0    94.444444
B     75     3.5    83.333333
C     65     3.0    72.222222
D     55     2.5    61.111111
   Marks  Grades
A     85     4.0
B     75     3.5
C     65     3.0
D     55     2.5


### NaN

In [37]:
A = pd.DataFrame([{'a':1,'b':4},{'b':-3,'c':9}])
print(A)
print(A.fillna(0))



     a  b    c
0  1.0  4  NaN
1  NaN -3  9.0
     a  b    c
0  1.0  4  0.0
1  0.0 -3  9.0


### loc
- To handle explicit and implicit indexing
- In implicit indexing while slicing last index is considered while in explicit not.

In [43]:
A = pd.Series(['a','b','c'], index=[1,3,5])   # Here a,b,c are implicit endex and 1,3,5 are explicit index
print(A[1])        # explicit index
print(A[1:3])      # implicit index
print(A.loc[1:3])  # explicit index -> 1 and 3
print(A.iloc[1:3]) # implicit index

print(D)
print(D.iloc[2,:])
print(D.iloc[::-1,:])

a
3    b
5    c
dtype: object
1    a
3    b
dtype: object
3    b
5    c
dtype: object
   Marks  Grades
A     85     4.0
B     75     3.5
C     65     3.0
D     55     2.5
Marks     65.0
Grades     3.0
Name: C, dtype: float64
   Marks  Grades
D     55     2.5
C     65     3.0
B     75     3.5
A     85     4.0


## csv files

In [48]:
from sklearn.impute import SimpleImputer

In [63]:
# Reading Data
df = pd.read_csv('/Users/compact.csv')
print(df.head(10))

       country        date  total_cases  new_cases  new_cases_smoothed  \
0  Afghanistan  2020-01-01          NaN        NaN                 NaN   
1  Afghanistan  2020-01-02          NaN        NaN                 NaN   
2  Afghanistan  2020-01-03          NaN        NaN                 NaN   
3  Afghanistan  2020-01-04          0.0        0.0                 NaN   
4  Afghanistan  2020-01-05          0.0        0.0                 NaN   
5  Afghanistan  2020-01-06          0.0        0.0                 NaN   
6  Afghanistan  2020-01-07          0.0        0.0                 NaN   
7  Afghanistan  2020-01-08          0.0        0.0                 NaN   
8  Afghanistan  2020-01-09          0.0        0.0                 0.0   
9  Afghanistan  2020-01-10          0.0        0.0                 0.0   

   total_cases_per_million  new_cases_per_million  \
0                      NaN                    NaN   
1                      NaN                    NaN   
2                      NaN

In [64]:
# Modifying Data
df.drop(['extreme_poverty','human_development_index'], axis=1, inplace=True) # inplace=true means changes will reflect in df variable itself
df.rename(columns={'date':'observation_date'}, inplace=True)
df['observation_date'] = pd.to_datetime(df['observation_date'])
print(df.head(10))

print(df.describe())
print(df.info())
print(df.fillna('NA'))

       country observation_date  total_cases  new_cases  new_cases_smoothed  \
0  Afghanistan       2020-01-01          NaN        NaN                 NaN   
1  Afghanistan       2020-01-02          NaN        NaN                 NaN   
2  Afghanistan       2020-01-03          NaN        NaN                 NaN   
3  Afghanistan       2020-01-04          0.0        0.0                 NaN   
4  Afghanistan       2020-01-05          0.0        0.0                 NaN   
5  Afghanistan       2020-01-06          0.0        0.0                 NaN   
6  Afghanistan       2020-01-07          0.0        0.0                 NaN   
7  Afghanistan       2020-01-08          0.0        0.0                 NaN   
8  Afghanistan       2020-01-09          0.0        0.0                 0.0   
9  Afghanistan       2020-01-10          0.0        0.0                 0.0   

   total_cases_per_million  new_cases_per_million  \
0                      NaN                    NaN   
1                      N

In [81]:
# Fetching Data

df2 = df.groupby('country')[['total_cases', 'new_cases', 'total_deaths']].sum().reset_index()
print(df2)

df2 = df.groupby(['country', 'observation_date'])[['total_cases', 'new_cases', 'total_deaths']].sum().reset_index()
print(df2)

df3 = df2[df2['total_cases']>100]
print(df3)

                                               country   total_cases  \
0                                          Afghanistan  3.373140e+08   
1                                               Africa  1.958007e+10   
2                                              Albania  4.864377e+08   
3                                              Algeria  4.270483e+08   
4                                       American Samoa  1.013716e+07   
..                                                 ...           ...   
257                  World excl. China and South Korea  8.793242e+11   
258  World excl. China, South Korea, Japan and Sing...  8.383532e+11   
259                                              Yemen  1.860946e+07   
260                                             Zambia  5.093094e+08   
261                                           Zimbabwe  3.794344e+08   

       new_cases  total_deaths  
0       235214.0  1.258689e+07  
1     13064968.0  4.062112e+08  
2       337202.0  5.719974e+06  
3  