# Pandas

- pandas is a open source data analysis lib written in python
- provides power and speed of numpy to make data analysis and preprocessing easy for data science
- provides rich and highly robust data operation

### imports

In [2]:
import numpy as np
import pandas as pd

### making a dataframe 

In [3]:
dict1 = {
    'name':['harry','ram','rohan','hari'],
    'marks':[92,34,24,17],
    'city':['rampur','kolkata','kathmandu','berlin']
}

In [4]:
df = pd.DataFrame(dict1) #dataframe gives the dict data in a dataframe format

In [5]:
df

Unnamed: 0,name,marks,city
0,harry,92,rampur
1,ram,34,kolkata
2,rohan,24,kathmandu
3,hari,17,berlin


## exporting to excell sheet

In [6]:
df.to_csv('dic1.csv')

In [7]:
df.to_csv('dict1.csv',index=False) #removes the index

### preveiwing

In [8]:
df.head(2) #show only first 2 rows 

Unnamed: 0,name,marks,city
0,harry,92,rampur
1,ram,34,kolkata


In [9]:
df.tail(2) #shows last 2 rows

Unnamed: 0,name,marks,city
2,rohan,24,kathmandu
3,hari,17,berlin


In [10]:
df.describe() #descriptions such as count max min mean ...

Unnamed: 0,marks
count,4.0
mean,41.75
std,34.21866
min,17.0
25%,22.25
50%,29.0
75%,48.5
max,92.0


### opening a csv file 

In [11]:
dataset = pd.read_csv('university-wise-student-enrollment-of-higher-education-by-sex-in-2074-bs.csv')

In [12]:
dataset.head(10)

Unnamed: 0,University,Male,Female,Total
0,Tribhuvan University,155336,129117,284453
1,Nepal Sanskrit University,250,1221,1471
2,Kathmandu University,8001,8657,16658
3,Purbanchal University,10657,12882,23539
4,Pokhara University,10228,15804,26032
5,Lumbini Bauddha University,57,139,196
6,Agriculture and Forestry University,487,1096,1583
7,Mid-Western University,1284,1762,3046
8,Far Western University,1107,1104,2211
9,BP Koirala Institute for Health Sciences,704,744,1448


### updating values 

In [13]:
dataset['University'][4] = 'PU'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['University'][4] = 'PU'


In [14]:
dataset['University'][4]

'PU'

In [15]:
dataset['University'][4] = 'Pokhara University'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['University'][4] = 'Pokhara University'


In [16]:
dataset['University'][4]

'Pokhara University'

In [17]:
dataset.describe()

Unnamed: 0,Male,Female,Total
count,16.0,16.0,16.0
mean,23536.875,21597.75,45134.625
std,58313.193489,51354.777747,109613.350693
min,0.0,0.0,0.0
25%,47.5,122.0,169.5
50%,595.5,1100.0,1527.0
75%,8557.75,9713.25,18378.25
max,188295.0,172782.0,361077.0


## Pandas Data Structure

- **Series** &rarr; 1D array with indexes , it stores single column or row of data
- **DataFrame** &rarr; tabular spreadsheet containing multiple rows and columns 

![IMG_20230201_100352_edit_2184189659402650.jpg](attachment:IMG_20230201_100352_edit_2184189659402650.jpg)

### creating a series

In [18]:
s = pd.Series(np.random.rand(30)) #generate an numpy array of random float of size 30  in pandas series

In [19]:
s

0     0.916764
1     0.761812
2     0.877888
3     0.791279
4     0.317608
5     0.729091
6     0.874889
7     0.551798
8     0.257083
9     0.713800
10    0.016564
11    0.564182
12    0.527017
13    0.029598
14    0.560393
15    0.687117
16    0.259724
17    0.430775
18    0.041869
19    0.192028
20    0.738337
21    0.714921
22    0.235994
23    0.770206
24    0.007105
25    0.157499
26    0.029165
27    0.555204
28    0.879750
29    0.382380
dtype: float64

In [20]:
type(s) # data type is pandas series

pandas.core.series.Series

### creating a dataframe

random numpy matrix of ```334 X 5``` size and index starting from 1

In [21]:
newDf = pd.DataFrame(np.random.rand(334,5), index=np.arange(1,335))

In [22]:
newDf.head(5)

Unnamed: 0,0,1,2,3,4
1,0.173153,0.668869,0.55056,0.392664,0.429106
2,0.728928,0.858067,0.343773,0.836574,0.391632
3,0.265069,0.981319,0.365959,0.966157,0.626696
4,0.741299,0.716635,0.791707,0.244812,0.54132
5,0.243107,0.162554,0.6723,0.431907,0.296605


In [23]:
type(newDf) # datatype is dataframe

pandas.core.frame.DataFrame

In [24]:
newDf.dtypes

0    float64
1    float64
2    float64
3    float64
4    float64
dtype: object

In [25]:
newDf[0][1] = 'test'

In [26]:
newDf.dtypes

0     object
1    float64
2    float64
3    float64
4    float64
dtype: object

In [27]:
newDf.head()

Unnamed: 0,0,1,2,3,4
1,test,0.668869,0.55056,0.392664,0.429106
2,0.728928,0.858067,0.343773,0.836574,0.391632
3,0.265069,0.981319,0.365959,0.966157,0.626696
4,0.741299,0.716635,0.791707,0.244812,0.54132
5,0.243107,0.162554,0.6723,0.431907,0.296605


In [28]:
newDf.index # lists all the index

Int64Index([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,
            ...
            325, 326, 327, 328, 329, 330, 331, 332, 333, 334],
           dtype='int64', length=334)

In [29]:
newDf.columns

RangeIndex(start=0, stop=5, step=1)

### converting a dataframe to numpy array

In [30]:
newDf.to_numpy()

array([['test', 0.66886865137182, 0.550560304955775, 0.39266387527274704,
        0.4291062388281751],
       [0.7289282616777374, 0.8580670377377111, 0.34377256268300105,
        0.8365736975949123, 0.3916319373583166],
       [0.26506887069455065, 0.981319106522999, 0.36595925011229036,
        0.9661573043257062, 0.6266958379380062],
       ...,
       [0.4726109441228741, 0.4989310395283796, 0.5577920380549093,
        0.8373749686310383, 0.7775091152781943],
       [0.6197268458633255, 0.4616333268029722, 0.9562094566607492,
        0.5406504222477541, 0.5675503445507135],
       [0.9437682647946201, 0.642335344847642, 0.6941401550649176,
        0.14213676291499266, 0.08759032790564636]], dtype=object)

In [31]:
newDf.T # transposes the data types

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,325,326,327,328,329,330,331,332,333,334
0,test,0.728928,0.265069,0.741299,0.243107,0.470949,0.589847,0.739633,0.87905,0.993784,...,0.175778,0.273052,0.236164,0.756907,0.033856,0.61127,0.453003,0.472611,0.619727,0.943768
1,0.668869,0.858067,0.981319,0.716635,0.162554,0.599484,0.482395,0.647122,0.882414,0.365112,...,0.477598,0.423756,0.058127,0.343332,0.193461,0.597161,0.940075,0.498931,0.461633,0.642335
2,0.55056,0.343773,0.365959,0.791707,0.6723,0.653227,0.758504,0.316379,0.410339,0.325062,...,0.522051,0.792961,0.637746,0.929823,0.496328,0.444437,0.16364,0.557792,0.956209,0.69414
3,0.392664,0.836574,0.966157,0.244812,0.431907,0.584692,0.182121,0.675386,0.005036,0.615532,...,0.333769,0.217279,0.975086,0.464191,0.141595,0.128546,0.993653,0.837375,0.54065,0.142137
4,0.429106,0.391632,0.626696,0.54132,0.296605,0.742148,0.92083,0.888248,0.335636,0.957346,...,0.815508,0.262957,0.417578,0.285947,0.609427,0.543568,0.669081,0.777509,0.56755,0.08759


In [32]:
newDf.head(3)

Unnamed: 0,0,1,2,3,4
1,test,0.668869,0.55056,0.392664,0.429106
2,0.728928,0.858067,0.343773,0.836574,0.391632
3,0.265069,0.981319,0.365959,0.966157,0.626696


### sorting index

In [33]:
newDf.sort_index(axis =0, ascending= False) # rows

Unnamed: 0,0,1,2,3,4
334,0.943768,0.642335,0.694140,0.142137,0.087590
333,0.619727,0.461633,0.956209,0.540650,0.567550
332,0.472611,0.498931,0.557792,0.837375,0.777509
331,0.453003,0.940075,0.163640,0.993653,0.669081
330,0.61127,0.597161,0.444437,0.128546,0.543568
...,...,...,...,...,...
5,0.243107,0.162554,0.672300,0.431907,0.296605
4,0.741299,0.716635,0.791707,0.244812,0.541320
3,0.265069,0.981319,0.365959,0.966157,0.626696
2,0.728928,0.858067,0.343773,0.836574,0.391632


In [34]:
newDf.sort_index(axis = 1, ascending= False) # column

Unnamed: 0,4,3,2,1,0
1,0.429106,0.392664,0.550560,0.668869,test
2,0.391632,0.836574,0.343773,0.858067,0.728928
3,0.626696,0.966157,0.365959,0.981319,0.265069
4,0.541320,0.244812,0.791707,0.716635,0.741299
5,0.296605,0.431907,0.672300,0.162554,0.243107
...,...,...,...,...,...
330,0.543568,0.128546,0.444437,0.597161,0.61127
331,0.669081,0.993653,0.163640,0.940075,0.453003
332,0.777509,0.837375,0.557792,0.498931,0.472611
333,0.567550,0.540650,0.956209,0.461633,0.619727


In [35]:
newDf[0]

1          test
2      0.728928
3      0.265069
4      0.741299
5      0.243107
         ...   
330     0.61127
331    0.453003
332    0.472611
333    0.619727
334    0.943768
Name: 0, Length: 334, dtype: object

### views vs copy

In [36]:
newDf_2 = newDf

In [37]:
newDf_2[0][1] = 'hello'

**the value is changed from ```test``` to ```hello``` on both ```newDf``` and ```newDF_2``` because ```newDf_2``` is pointing to ```newDf```**

In [38]:
newDf

Unnamed: 0,0,1,2,3,4
1,hello,0.668869,0.550560,0.392664,0.429106
2,0.728928,0.858067,0.343773,0.836574,0.391632
3,0.265069,0.981319,0.365959,0.966157,0.626696
4,0.741299,0.716635,0.791707,0.244812,0.541320
5,0.243107,0.162554,0.672300,0.431907,0.296605
...,...,...,...,...,...
330,0.61127,0.597161,0.444437,0.128546,0.543568
331,0.453003,0.940075,0.163640,0.993653,0.669081
332,0.472611,0.498931,0.557792,0.837375,0.777509
333,0.619727,0.461633,0.956209,0.540650,0.567550


In [39]:
newDf_2

Unnamed: 0,0,1,2,3,4
1,hello,0.668869,0.550560,0.392664,0.429106
2,0.728928,0.858067,0.343773,0.836574,0.391632
3,0.265069,0.981319,0.365959,0.966157,0.626696
4,0.741299,0.716635,0.791707,0.244812,0.541320
5,0.243107,0.162554,0.672300,0.431907,0.296605
...,...,...,...,...,...
330,0.61127,0.597161,0.444437,0.128546,0.543568
331,0.453003,0.940075,0.163640,0.993653,0.669081
332,0.472611,0.498931,0.557792,0.837375,0.777509
333,0.619727,0.461633,0.956209,0.540650,0.567550


**Thus to create a copy we must use the ```copy()``` function**

In [40]:
newDf_2 = newDf.copy()

In [41]:
newDf_2[0][1] = 'Not Hello'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newDf_2[0][1] = 'Not Hello'


In [42]:
newDf

Unnamed: 0,0,1,2,3,4
1,hello,0.668869,0.550560,0.392664,0.429106
2,0.728928,0.858067,0.343773,0.836574,0.391632
3,0.265069,0.981319,0.365959,0.966157,0.626696
4,0.741299,0.716635,0.791707,0.244812,0.541320
5,0.243107,0.162554,0.672300,0.431907,0.296605
...,...,...,...,...,...
330,0.61127,0.597161,0.444437,0.128546,0.543568
331,0.453003,0.940075,0.163640,0.993653,0.669081
332,0.472611,0.498931,0.557792,0.837375,0.777509
333,0.619727,0.461633,0.956209,0.540650,0.567550


In [43]:
newDf_2

Unnamed: 0,0,1,2,3,4
1,Not Hello,0.668869,0.550560,0.392664,0.429106
2,0.728928,0.858067,0.343773,0.836574,0.391632
3,0.265069,0.981319,0.365959,0.966157,0.626696
4,0.741299,0.716635,0.791707,0.244812,0.541320
5,0.243107,0.162554,0.672300,0.431907,0.296605
...,...,...,...,...,...
330,0.61127,0.597161,0.444437,0.128546,0.543568
331,0.453003,0.940075,0.163640,0.993653,0.669081
332,0.472611,0.498931,0.557792,0.837375,0.777509
333,0.619727,0.461633,0.956209,0.540650,0.567550


### SettingWithCopyWarning fix

assigning values to a index in a dataframe we get the error as above ie. SettingWithCopyWarning this is because we are not telling pandas to assign the value with copy or view. Thus pandas decides for itself.

#### ```loc()``` function

In [44]:
newDf_2.loc[1,0] = 'fixed' # changing at index 1,0

In [45]:
newDf_2.head(3)

Unnamed: 0,0,1,2,3,4
1,fixed,0.668869,0.55056,0.392664,0.429106
2,0.728928,0.858067,0.343773,0.836574,0.391632
3,0.265069,0.981319,0.365959,0.966157,0.626696


#### changing index and columns

In [47]:
newDf_2.index = np.arange(0,334)

In [48]:
newDf_2.columns = list("ABCDE") # set columns as list of A to E

In [49]:
newDf_2.head(3)

Unnamed: 0,A,B,C,D,E
0,fixed,0.668869,0.55056,0.392664,0.429106
1,0.728928,0.858067,0.343773,0.836574,0.391632
2,0.265069,0.981319,0.365959,0.966157,0.626696


In [50]:
newDf_2.loc[0,'A'] = 0.123455

In [51]:
newDf_2

Unnamed: 0,A,B,C,D,E
0,0.123455,0.668869,0.550560,0.392664,0.429106
1,0.728928,0.858067,0.343773,0.836574,0.391632
2,0.265069,0.981319,0.365959,0.966157,0.626696
3,0.741299,0.716635,0.791707,0.244812,0.541320
4,0.243107,0.162554,0.672300,0.431907,0.296605
...,...,...,...,...,...
329,0.61127,0.597161,0.444437,0.128546,0.543568
330,0.453003,0.940075,0.163640,0.993653,0.669081
331,0.472611,0.498931,0.557792,0.837375,0.777509
332,0.619727,0.461633,0.956209,0.540650,0.567550


#### creating/deteting a new column/row 

In [54]:
newDf_2.loc[:,'F'] = np.random.rand(334) # F does not exist as col so creates a F and assigns the value

In [55]:
newDf_2

Unnamed: 0,A,B,C,D,E,F
0,0.123455,0.668869,0.550560,0.392664,0.429106,0.585066
1,0.728928,0.858067,0.343773,0.836574,0.391632,0.996938
2,0.265069,0.981319,0.365959,0.966157,0.626696,0.363004
3,0.741299,0.716635,0.791707,0.244812,0.541320,0.468465
4,0.243107,0.162554,0.672300,0.431907,0.296605,0.752823
...,...,...,...,...,...,...
329,0.61127,0.597161,0.444437,0.128546,0.543568,0.235681
330,0.453003,0.940075,0.163640,0.993653,0.669081,0.595294
331,0.472611,0.498931,0.557792,0.837375,0.777509,0.219965
332,0.619727,0.461633,0.956209,0.540650,0.567550,0.585787


**```drop()```**

In [56]:
newDf_2.drop('F',axis = 1)

Unnamed: 0,A,B,C,D,E
0,0.123455,0.668869,0.550560,0.392664,0.429106
1,0.728928,0.858067,0.343773,0.836574,0.391632
2,0.265069,0.981319,0.365959,0.966157,0.626696
3,0.741299,0.716635,0.791707,0.244812,0.541320
4,0.243107,0.162554,0.672300,0.431907,0.296605
...,...,...,...,...,...
329,0.61127,0.597161,0.444437,0.128546,0.543568
330,0.453003,0.940075,0.163640,0.993653,0.669081
331,0.472611,0.498931,0.557792,0.837375,0.777509
332,0.619727,0.461633,0.956209,0.540650,0.567550


In [58]:
newDf_2.drop(333,axis = 0)

Unnamed: 0,A,B,C,D,E,F
0,0.123455,0.668869,0.550560,0.392664,0.429106,0.585066
1,0.728928,0.858067,0.343773,0.836574,0.391632,0.996938
2,0.265069,0.981319,0.365959,0.966157,0.626696,0.363004
3,0.741299,0.716635,0.791707,0.244812,0.541320,0.468465
4,0.243107,0.162554,0.672300,0.431907,0.296605,0.752823
...,...,...,...,...,...,...
328,0.033856,0.193461,0.496328,0.141595,0.609427,0.815494
329,0.61127,0.597161,0.444437,0.128546,0.543568,0.235681
330,0.453003,0.940075,0.163640,0.993653,0.669081,0.595294
331,0.472611,0.498931,0.557792,0.837375,0.777509,0.219965


### taking a small dataframe from dataframe

In [59]:
newDf_3 = newDf_2.loc[[1,3], ['B','C','D']].copy() # rows 1 to 3 and columns B C and D

In [60]:
newDf_3

Unnamed: 0,B,C,D
1,0.858067,0.343773,0.836574
3,0.716635,0.791707,0.244812


**Problem :** list all the rows of A with no greater than 0.5

In [61]:
newDf_2.loc[ (newDf_2['A'] > 0.5) ]

Unnamed: 0,A,B,C,D,E,F
1,0.728928,0.858067,0.343773,0.836574,0.391632,0.996938
3,0.741299,0.716635,0.791707,0.244812,0.541320,0.468465
6,0.589847,0.482395,0.758504,0.182121,0.920830,0.114474
7,0.739633,0.647122,0.316379,0.675386,0.888248,0.541619
8,0.87905,0.882414,0.410339,0.005036,0.335636,0.347043
...,...,...,...,...,...,...
323,0.781255,0.060188,0.302072,0.714028,0.006019,0.459232
327,0.756907,0.343332,0.929823,0.464191,0.285947,0.411258
329,0.61127,0.597161,0.444437,0.128546,0.543568,0.235681
332,0.619727,0.461633,0.956209,0.540650,0.567550,0.585787


In [62]:
newDf_2.loc[ (newDf_2['A'] > 0.5) & (newDf_2['C'] < 0.8)]

Unnamed: 0,A,B,C,D,E,F
1,0.728928,0.858067,0.343773,0.836574,0.391632,0.996938
3,0.741299,0.716635,0.791707,0.244812,0.541320,0.468465
6,0.589847,0.482395,0.758504,0.182121,0.920830,0.114474
7,0.739633,0.647122,0.316379,0.675386,0.888248,0.541619
8,0.87905,0.882414,0.410339,0.005036,0.335636,0.347043
...,...,...,...,...,...,...
310,0.823729,0.550158,0.787652,0.271829,0.874305,0.204317
322,0.864787,0.841584,0.376501,0.923747,0.711084,0.717154
323,0.781255,0.060188,0.302072,0.714028,0.006019,0.459232
329,0.61127,0.597161,0.444437,0.128546,0.543568,0.235681


#### ascessing by position

In [63]:
newDf_2.head(3)

Unnamed: 0,A,B,C,D,E,F
0,0.123455,0.668869,0.55056,0.392664,0.429106,0.585066
1,0.728928,0.858067,0.343773,0.836574,0.391632,0.996938
2,0.265069,0.981319,0.365959,0.966157,0.626696,0.363004


#### ```iloc()```

In [64]:
newDf_2.iloc[1,3] # counts row from 0-1 and counts colmn from 0-3

0.8365736975949123

to find row and column then ```loc()``` 


to find value only then ```iloc()```

#### reseting the index

In [65]:
newDf_2.head(3)

Unnamed: 0,A,B,C,D,E,F
0,0.123455,0.668869,0.55056,0.392664,0.429106,0.585066
1,0.728928,0.858067,0.343773,0.836574,0.391632,0.996938
2,0.265069,0.981319,0.365959,0.966157,0.626696,0.363004


In [67]:
newDf.reset_index() #resets the index but adds a index column 

Unnamed: 0,index,0,1,2,3,4
0,1,hello,0.668869,0.550560,0.392664,0.429106
1,2,0.728928,0.858067,0.343773,0.836574,0.391632
2,3,0.265069,0.981319,0.365959,0.966157,0.626696
3,4,0.741299,0.716635,0.791707,0.244812,0.541320
4,5,0.243107,0.162554,0.672300,0.431907,0.296605
...,...,...,...,...,...,...
329,330,0.61127,0.597161,0.444437,0.128546,0.543568
330,331,0.453003,0.940075,0.163640,0.993653,0.669081
331,332,0.472611,0.498931,0.557792,0.837375,0.777509
332,333,0.619727,0.461633,0.956209,0.540650,0.567550


In [70]:
newDf.reset_index(drop = True,inplace = True) # drops the index

#### ```inplace()``` function changes the orignal data

#### ```isnull()``` function 

In [74]:
newDf[0].isnull() # returns null values as true

0      False
1      False
2      False
3      False
4      False
       ...  
329    False
330    False
331    False
332    False
333    False
Name: 0, Length: 334, dtype: bool

In [None]:
newDf.loc[:,0] = None

In [81]:
newDf.head(3)

Unnamed: 0,0,1,2,3,4
0,,,,,
1,,0.858067,0.343773,0.836574,0.391632
2,,0.981319,0.365959,0.966157,0.626696


In [80]:
newDf[0].isnull()

0      True
1      True
2      True
3      True
4      True
       ... 
329    True
330    True
331    True
332    True
333    True
Name: 0, Length: 334, dtype: bool

In [83]:
newDf.dropna?

![image.png](attachment:image.png)

### removing entire null columns

In [86]:
newDf.dropna(how = 'all', axis = 1)

Unnamed: 0,1,2,3,4
0,,,,
1,0.858067,0.343773,0.836574,0.391632
2,0.981319,0.365959,0.966157,0.626696
3,0.716635,0.791707,0.244812,0.541320
4,0.162554,0.672300,0.431907,0.296605
...,...,...,...,...
329,0.597161,0.444437,0.128546,0.543568
330,0.940075,0.163640,0.993653,0.669081
331,0.498931,0.557792,0.837375,0.777509
332,0.461633,0.956209,0.540650,0.567550


### removing duplicates

In [87]:
newDf.drop_duplicates?

![image.png](attachment:image.png)

In [90]:
newDf.drop_duplicates(subset = [0]) #removes duplicates from 0

Unnamed: 0,0,1,2,3,4
0,,,,,


In [91]:
newDf.drop_duplicates(keep = 'last') # keeps last 

Unnamed: 0,0,1,2,3,4
0,,,,,
1,,0.858067,0.343773,0.836574,0.391632
2,,0.981319,0.365959,0.966157,0.626696
3,,0.716635,0.791707,0.244812,0.541320
4,,0.162554,0.672300,0.431907,0.296605
...,...,...,...,...,...
329,,0.597161,0.444437,0.128546,0.543568
330,,0.940075,0.163640,0.993653,0.669081
331,,0.498931,0.557792,0.837375,0.777509
332,,0.461633,0.956209,0.540650,0.567550


In [94]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   name    4 non-null      object
 1   marks   4 non-null      int64 
 2   city    4 non-null      object
dtypes: int64(1), object(2)
memory usage: 224.0+ bytes


#### count unique values

In [97]:
df['name'].value_counts(dropna = False) # dont count/remove NaN

harry    1
ram      1
rohan    1
hari     1
Name: name, dtype: int64

In [98]:
df.notnull()

Unnamed: 0,name,marks,city
0,True,True,True
1,True,True,True
2,True,True,True
3,True,True,True


## some staticts functions

In [104]:
# creating a int dataframe
intdf = pd.DataFrame(
    np.random.randint(10,50,[3,3])
)

In [105]:
intdf

Unnamed: 0,0,1,2
0,29,13,33
1,38,39,36
2,23,18,30


In [111]:
print('Min = ',intdf.min())

Min =  0    23
1    13
2    30
dtype: int32


In [112]:
print('Max = ',intdf.max())

Max =  0    38
1    39
2    36
dtype: int32


In [113]:
print('Mean = ',intdf.mean())

Mean =  0    30.000000
1    23.333333
2    33.000000
dtype: float64


In [114]:
print('Median = ',intdf.median())

Median =  0    29.0
1    18.0
2    33.0
dtype: float64


In [115]:
print('Count= ',intdf.count())

Count=  0    3
1    3
2    3
dtype: int64


In [117]:
print('Correlation = ',intdf.corr())

Correlation =            0         1         2
0  1.000000  0.830465  0.993399
1  0.830465  1.000000  0.761083
2  0.993399  0.761083  1.000000


In [118]:
print('Standard deviation = ',intdf.std())

Standard deviation =  0     7.549834
1    13.796135
2     3.000000
dtype: float64


### read Excel sheets

In [119]:
demographicData = pd.read_excel('demographic_of_nepal.xlsx')

In [120]:
demographicData.head(4)

Unnamed: 0,Most populous caste/ethnic groups (Census 2011)[13][14],Population,% of total
0,Most populous caste/ethnic groups (Census 2011),Population,% of total
1,Chhetri/Khas,4398053,16.6%
2,Bahun,3226903,12.2%
3,Magar,1877733,7.3%


In [127]:
demographicData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28 entries, 0 to 27
Data columns (total 3 columns):
 #   Column                                                   Non-Null Count  Dtype 
---  ------                                                   --------------  ----- 
 0   Most populous caste/ethnic groups (Census 2011)[13][14]  28 non-null     object
 1   Population                                               28 non-null     object
 2   % of total                                               28 non-null     object
dtypes: object(3)
memory usage: 800.0+ bytes


### ```rename()``` function

In [132]:
df.rename?

![image.png](attachment:image.png)

In [130]:
demographicData.rename(columns ={'Most populous caste/ethnic groups (Census 2011)[13][14]':'Caste/Ethnic groups'},inplace = True)

In [131]:
demographicData.head(4)

Unnamed: 0,Caste/Ethnic groups,Population,% of total
0,Most populous caste/ethnic groups (Census 2011),Population,% of total
1,Chhetri/Khas,4398053,16.6%
2,Bahun,3226903,12.2%
3,Magar,1877733,7.3%


the first index 0 is worthless so dropping it

In [135]:
demographicData.drop(index = 0,inplace = True)

In [137]:
demographicData

Unnamed: 0,Caste/Ethnic groups,Population,% of total
1,Chhetri/Khas,4398053,16.6%
2,Bahun,3226903,12.2%
3,Magar,1877733,7.3%
4,Tharu,1737470,6.6%
5,Tamang,1321933,6.5%
6,Newar,1539830,5.9%
7,Sanyasi/Dasnami,1287633,4.8%
8,Khas-Kami,1258554,4.7%
9,Muslim (taken as a single religious group),1164255,4.4%
10,Yadav,1054458,4.0%


In [149]:
demographicData.dtypes['Population'] # gives data types 

dtype('O')

In [160]:
demographicData['Population']

1     4,398,053
2     3,226,903
3     1,877,733
4     1,737,470
5     1,321,933
6     1,539,830
7     1,287,633
8     1,258,554
9     1,164,255
10    1,054,458
11      690,989
12      522,641
13      512,926
14      425,623
15      387,300
16      374,816
17      369,688
18      335,893
19      306,393
20      234,490
21      231,129
22      219,808
23      208,910
24      112,946
25      100,000
26      121,196
27    4,229,290
Name: Population, dtype: object

changing the object into number as 

In [164]:
pd.to_numeric(demographicData['Population'])

ValueError: Unable to parse string "4,398,053" at position 0

str at index $[0,0]$  has commas so we need to remove it by replacing it as

In [169]:
demographicData['Population'] = pd.to_numeric(demographicData['Population'].str.replace(',',''))

similarly for percentage

In [171]:
demographicData['% of total']

1      16.6%
2      12.2%
3       7.3%
4       6.6%
5       6.5%
6       5.9%
7       4.8%
8       4.7%
9       4.4%
10      4.0%
11      2.3%
12      1.9%
13      1.8%
14      1.6%
15      1.4%
16     1.41%
17      1.4%
18      1.3%
19      1.1%
20     0.88%
21     0.87%
22     0.82%
23     0.79%
24     0.42%
25     0.38%
26     0.46%
27    15.96%
Name: % of total, dtype: object

In [173]:
# removing %
demographicData['% of total'] = demographicData['% of total'].str.replace('%','')

In [176]:
# converting to numeric
demographicData['% of total'] = pd.to_numeric(demographicData['% of total'])

In [177]:
demographicData['% of total']

1     16.60
2     12.20
3      7.30
4      6.60
5      6.50
6      5.90
7      4.80
8      4.70
9      4.40
10     4.00
11     2.30
12     1.90
13     1.80
14     1.60
15     1.40
16     1.41
17     1.40
18     1.30
19     1.10
20     0.88
21     0.87
22     0.82
23     0.79
24     0.42
25     0.38
26     0.46
27    15.96
Name: % of total, dtype: float64

In [178]:
#  it is % so divide it by 100
demographicData['% of total'] = (demographicData['% of total']/100)

In [179]:
demographicData['% of total']

1     0.1660
2     0.1220
3     0.0730
4     0.0660
5     0.0650
6     0.0590
7     0.0480
8     0.0470
9     0.0440
10    0.0400
11    0.0230
12    0.0190
13    0.0180
14    0.0160
15    0.0140
16    0.0141
17    0.0140
18    0.0130
19    0.0110
20    0.0088
21    0.0087
22    0.0082
23    0.0079
24    0.0042
25    0.0038
26    0.0046
27    0.1596
Name: % of total, dtype: float64

### write the changes to excel sheet

In [181]:
demographicData.to_excel('demographicDataNepal.xlsx')