In [1]:
import numpy as np
import pandas as pd

In [2]:
# Column-oriented sample data: each key is a column name, each list holds that column's values.
dict1 = dict(
    name=["rishi", "siya", "kishan", "prince", "rajeev", "prem"],
    marks=[100, 99, 90, 80, 70, 60],
    city=["delhi", "delhi", "pune", "kolkata", "agra", "lucknow"],
)

In [3]:
dict1

{'name': ['rishi', 'siya', 'kishan', 'prince', 'rajeev', 'prem'],
 'marks': [100, 99, 90, 80, 70, 60],
 'city': ['delhi', 'delhi', 'pune', 'kolkata', 'agra', 'lucknow']}

In [4]:
# Build a DataFrame from the dict: keys become column names, list items become the rows.
df = pd.DataFrame(dict1)

In [5]:
df

Unnamed: 0,name,marks,city
0,rishi,100,delhi
1,siya,99,delhi
2,kishan,90,pune
3,prince,80,kolkata
4,rajeev,70,agra
5,prem,60,lucknow


### To convert a dataframe into csv

In [6]:
# Write the frame to a CSV file; by default the row labels are written too, as an extra leading column.
df.to_csv("friends.csv")

In [7]:
# index=False leaves the row labels out, so only the name/marks/city columns are written.
df.to_csv("friends_without_index.csv", index = False)

### To see x rows from the start/end

In [8]:
df.tail(2)

Unnamed: 0,name,marks,city
4,rajeev,70,agra
5,prem,60,lucknow


In [9]:
df.head(2)

Unnamed: 0,name,marks,city
0,rishi,100,delhi
1,siya,99,delhi


### For a basic statistical analysis of the numeric columns in our data frame

In [10]:
df.describe()

Unnamed: 0,marks
count,6.0
mean,83.166667
std,16.129683
min,60.0
25%,72.5
50%,85.0
75%,96.75
max,100.0


In [11]:
# Prints a summary of the frame: index range, each column's non-null count and dtype, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   name    6 non-null      object
 1   marks   6 non-null      int64 
 2   city    6 non-null      object
dtypes: int64(1), object(2)
memory usage: 276.0+ bytes


### Importing a CSV into pandas 

In [12]:
# Read the CSV file (path is relative to the working directory) into a DataFrame.
trains = pd.read_csv("demo_trains.csv")

In [13]:
trains

Unnamed: 0,Train No,Speed,City
0,12001,85,Delhi
1,12951,95,Mumbai
2,12309,110,Kolkata
3,12627,100,Chennai
4,12229,105,Bangalore
5,12801,90,Bhubaneswar
6,12423,115,Lucknow
7,12101,92,Nagpur
8,12791,98,Hyderabad
9,12953,108,Ahmedabad


### Note: Values can be changed through chained assignment (`df["col"][row] = value`), but it fails in some cases, so don't use it — use `df.loc[row, "col"] = value` instead

In [14]:
# Deliberate anti-pattern (chained assignment): trains['Speed'] is evaluated first, then [0] = 92
# writes into that intermediate Series. pandas warns that this will stop updating `trains` once
# Copy-on-Write is the default; the single-step form is trains.loc[0, 'Speed'] = 92.
trains['Speed'][0] = 92

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  trains['Speed'][0] = 92
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trains['Speed'][0] = 92


In [15]:
trains

Unnamed: 0,Train No,Speed,City
0,12001,92,Delhi
1,12951,95,Mumbai
2,12309,110,Kolkata
3,12627,100,Chennai
4,12229,105,Bangalore
5,12801,90,Bhubaneswar
6,12423,115,Lucknow
7,12101,92,Nagpur
8,12791,98,Hyderabad
9,12953,108,Ahmedabad


### Change the index labels, but remember you have to either change all of them or none

In [16]:
# Replace every row label at once; the list supplies one label per row (6 labels for 6 rows).
# NOTE(review): 'four' and 'firth' look like typos for 'fourth' and 'fifth' — confirm before reuse.
df.index = ['first', 'second','third', 'four', 'firth', 'sixth']

In [17]:
df

Unnamed: 0,name,marks,city
first,rishi,100,delhi
second,siya,99,delhi
third,kishan,90,pune
four,prince,80,kolkata
firth,rajeev,70,agra
sixth,prem,60,lucknow


# Basic data structures in pandas
### Pandas provides two types of classes for handling data:

1. **Series**: a one-dimensional labeled array holding data of any type such as integers, strings, Python objects etc.

2. **DataFrame**: a two-dimensional data structure that holds data like a two-dimensional array or a table with rows and columns.

### Creating a Series

In [18]:
# A Series of 7 random floats; no seed is set, so the values differ on every run.
ser = pd.Series(np.random.rand(7))

In [19]:
ser, type(ser)

(0    0.358051
 1    0.618401
 2    0.529289
 3    0.638958
 4    0.956653
 5    0.486472
 6    0.717869
 dtype: float64,
 pandas.core.series.Series)

### Creating a DataFrame

In [20]:
# 200 rows x 5 columns of random floats (unseeded); the row labels are set explicitly to 0..199.
newdf = pd.DataFrame(np.random.rand(200, 5), index=np.arange(200))

In [21]:
newdf

Unnamed: 0,0,1,2,3,4
0,0.312090,0.217850,0.101502,0.102326,0.066531
1,0.708739,0.004512,0.883500,0.883035,0.924135
2,0.963714,0.790764,0.316427,0.210430,0.228343
3,0.418559,0.756871,0.688686,0.809792,0.183615
4,0.260112,0.478380,0.553646,0.182656,0.706365
...,...,...,...,...,...
195,0.821864,0.137651,0.592004,0.392635,0.994038
196,0.427370,0.528469,0.612050,0.285042,0.526008
197,0.869591,0.699794,0.336113,0.789530,0.903483
198,0.166446,0.713110,0.633339,0.607685,0.345071


In [22]:
type(newdf)

pandas.core.frame.DataFrame

### To convert a dataframe into numpy arrays

In [23]:
# Returns the values as a NumPy array; mixing text and integer columns yields dtype=object.
df.to_numpy()

array([['rishi', 100, 'delhi'],
       ['siya', 99, 'delhi'],
       ['kishan', 90, 'pune'],
       ['prince', 80, 'kolkata'],
       ['rajeev', 70, 'agra'],
       ['prem', 60, 'lucknow']], dtype=object)

## Attributes of DataFrame

In [24]:
# for transpose
newdf.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,0.31209,0.708739,0.963714,0.418559,0.260112,0.497004,0.859949,0.139158,0.642259,0.449554,...,0.046472,0.971273,0.92287,0.440941,0.593597,0.821864,0.42737,0.869591,0.166446,0.682845
1,0.21785,0.004512,0.790764,0.756871,0.47838,0.705409,0.078439,0.099396,0.153009,0.468908,...,0.156813,0.015116,0.261262,0.262845,0.365518,0.137651,0.528469,0.699794,0.71311,0.242358
2,0.101502,0.8835,0.316427,0.688686,0.553646,0.246923,0.689425,0.621504,0.30463,0.60102,...,0.081523,0.753631,0.382595,0.862049,0.850712,0.592004,0.61205,0.336113,0.633339,0.322079
3,0.102326,0.883035,0.21043,0.809792,0.182656,0.375867,0.804394,0.062875,0.308802,0.698584,...,0.211675,0.008682,0.838435,0.615438,0.47449,0.392635,0.285042,0.78953,0.607685,0.647419
4,0.066531,0.924135,0.228343,0.183615,0.706365,0.035005,0.001443,0.36487,0.718044,0.141186,...,0.450273,0.129723,0.098011,0.595426,0.665663,0.994038,0.526008,0.903483,0.345071,0.930364


In [25]:
# sort_index orders by labels, not by values: axis=0 sorts the row labels, axis=1 sorts the column labels.
# Here the column labels are put in descending order (4, 3, 2, 1, 0).
newdf.sort_index(axis=1, ascending=False)

Unnamed: 0,4,3,2,1,0
0,0.066531,0.102326,0.101502,0.217850,0.312090
1,0.924135,0.883035,0.883500,0.004512,0.708739
2,0.228343,0.210430,0.316427,0.790764,0.963714
3,0.183615,0.809792,0.688686,0.756871,0.418559
4,0.706365,0.182656,0.553646,0.478380,0.260112
...,...,...,...,...,...
195,0.994038,0.392635,0.592004,0.137651,0.821864
196,0.526008,0.285042,0.612050,0.528469,0.427370
197,0.903483,0.789530,0.336113,0.699794,0.869591
198,0.345071,0.607685,0.633339,0.713110,0.166446


In [26]:
# Select the column labelled 0; a single column comes back as a Series.
newdf[0]

0      0.312090
1      0.708739
2      0.963714
3      0.418559
4      0.260112
         ...   
195    0.821864
196    0.427370
197    0.869591
198    0.166446
199    0.682845
Name: 0, Length: 200, dtype: float64

In [27]:
# Plain assignment does not copy: newdf2 and newdf are two names for the same DataFrame, so any
# change made through one name is visible through the other. Use newdf.copy() for an independent frame.
# NOTE(review): newdf[:] creates a new object but may still share the underlying data, depending on
# the pandas version / Copy-on-Write setting — prefer .copy().
# Chained access (frame[col][row] = value) has its own pitfalls, so update with .loc[row, col] = value.
newdf2 = newdf

In [28]:
# Write in a single step through .loc (row label 0, column label 0) instead of the chained
# newdf2[0][0] = 10, which assigns into an intermediate Series and no longer updates the frame
# once pandas Copy-on-Write is active.
# newdf2 is another name for newdf, so newdf shows this change as well.
newdf2.loc[0, 0] = 10

In [29]:
# Rename all five columns at once: labels 0..4 become 'A'..'E'.
newdf2.columns = list("ABCDE")

In [30]:
newdf2

Unnamed: 0,A,B,C,D,E
0,10.000000,0.217850,0.101502,0.102326,0.066531
1,0.708739,0.004512,0.883500,0.883035,0.924135
2,0.963714,0.790764,0.316427,0.210430,0.228343
3,0.418559,0.756871,0.688686,0.809792,0.183615
4,0.260112,0.478380,0.553646,0.182656,0.706365
...,...,...,...,...,...
195,0.821864,0.137651,0.592004,0.392635,0.994038
196,0.427370,0.528469,0.612050,0.285042,0.526008
197,0.869591,0.699794,0.336113,0.789530,0.903483
198,0.166446,0.713110,0.633339,0.607685,0.345071


In [31]:
# Single-step, label-based assignment: row label 0, column 'A'.
newdf2.loc[0, 'A'] = 1224

In [32]:
newdf2

Unnamed: 0,A,B,C,D,E
0,1224.000000,0.217850,0.101502,0.102326,0.066531
1,0.708739,0.004512,0.883500,0.883035,0.924135
2,0.963714,0.790764,0.316427,0.210430,0.228343
3,0.418559,0.756871,0.688686,0.809792,0.183615
4,0.260112,0.478380,0.553646,0.182656,0.706365
...,...,...,...,...,...
195,0.821864,0.137651,0.592004,0.392635,0.994038
196,0.427370,0.528469,0.612050,0.285042,0.526008
197,0.869591,0.699794,0.336113,0.789530,0.903483
198,0.166446,0.713110,0.633339,0.607685,0.345071


In [33]:
# Select specific columns by label: ':' keeps every row, the list picks columns 'A' and 'C'.
newdf2.loc[:, ['A', 'C']]

Unnamed: 0,A,C
0,1224.000000,0.101502
1,0.708739,0.883500
2,0.963714,0.316427
3,0.418559,0.688686
4,0.260112,0.553646
...,...,...
195,0.821864,0.592004
196,0.427370,0.612050
197,0.869591,0.336113
198,0.166446,0.633339


In [34]:
# Boolean masks inside .loc work like a database WHERE clause. Combine conditions with & (and) or
# | (or), and wrap each condition in parentheses because & and | bind tighter than comparisons.
# Single-condition form: newdf2.loc[newdf2['A'] > 0.9]
newdf2.loc[(newdf2['A'] > 0.9) & (newdf2['C'] < 0.1)]

Unnamed: 0,A,B,C,D,E
56,0.96512,0.124921,0.044123,0.812822,0.220511
60,0.931773,0.779926,0.062355,0.453491,0.129347
116,0.970936,0.113815,0.075243,0.218945,0.789911


In [35]:
# .iloc selects purely by integer position (row 0, column 0), regardless of the row/column labels.
# newdf is the same object as newdf2, so this returns the 1224 written through newdf2 earlier.
newdf.iloc[0,0]

np.float64(1224.0)

| Feature         | `.loc[]`                 | `.iloc[]`                | `.at[]`           | `.iat[]`          |
| --------------- | ------------------------ | ------------------------ | ----------------- | ----------------- |
| Access by       | Label                    | Integer Position         | Label             | Integer Position  |
| Returns         | Scalar/Series/DataFrame  | Scalar/Series/DataFrame  | Scalar            | Scalar            |
| Slice inclusive | Yes (end label included) | No (end excluded)        | N/A (no slicing)  | N/A (no slicing)  |
| Best for        | General access           | Index-based access       | Fast single value | Fast single value |


In [36]:
# drop() removes rows by default (axis=0); pass axis=1 to remove columns.
# Note: neither call changes newdf2 — each returns a new DataFrame. Only the last expression of a
# cell is displayed, so the result of the first drop is discarded.
newdf2.drop(1) # drop the row labelled 1
newdf2.drop(['A', 'B'], axis=1) # drop columns 'A' and 'B'

Unnamed: 0,C,D,E
0,0.101502,0.102326,0.066531
1,0.883500,0.883035,0.924135
2,0.316427,0.210430,0.228343
3,0.688686,0.809792,0.183615
4,0.553646,0.182656,0.706365
...,...,...,...
195,0.592004,0.392635,0.994038
196,0.612050,0.285042,0.526008
197,0.336113,0.789530,0.903483
198,0.633339,0.607685,0.345071


In [37]:
# Some methods accept inplace=True, which modifies the original DataFrame instead of returning a new one.
newdf2.drop(['C', 'D'], axis=1, inplace=True)

In [38]:
newdf2.head()

Unnamed: 0,A,B,E
0,1224.0,0.21785,0.066531
1,0.708739,0.004512,0.924135
2,0.963714,0.790764,0.228343
3,0.418559,0.756871,0.183615
4,0.260112,0.47838,0.706365


In [39]:
# Drop the rows labelled 1, 3, 4 and 5 in place. Re-running this cell raises KeyError because
# those labels no longer exist.
newdf2.drop([1, 3, 4, 5], inplace=True)

In [40]:
newdf2.head()

Unnamed: 0,A,B,E
0,1224.0,0.21785,0.066531
2,0.963714,0.790764,0.228343
6,0.859949,0.078439,0.001443
7,0.139158,0.099396,0.36487
8,0.642259,0.153009,0.718044


In [41]:
# After the drops the row labels have gaps (0, 2, 6, ...). reset_index() renumbers the rows from 0,
# but by default it keeps the old labels as a new leading column named 'index'.
# Pass drop=True to discard the old labels instead. This call returns a new frame; newdf2 is unchanged.
newdf2.reset_index()

Unnamed: 0,index,A,B,E
0,0,1224.000000,0.217850,0.066531
1,2,0.963714,0.790764,0.228343
2,6,0.859949,0.078439,0.001443
3,7,0.139158,0.099396,0.364870
4,8,0.642259,0.153009,0.718044
...,...,...,...,...
191,195,0.821864,0.137651,0.994038
192,196,0.427370,0.528469,0.526008
193,197,0.869591,0.699794,0.903483
194,198,0.166446,0.713110,0.345071


In [42]:
# Renumber the rows from 0 in place and discard the old labels (no 'index' column is added).
newdf2.reset_index(drop=True, inplace=True)

In [43]:
# isnull() returns a same-shaped frame of booleans: True where a value is missing.
# .loc slices include the end label, so column 'B' is set to missing for row labels 1 through 5.
newdf2.loc[1:5, ['B']] = None
newdf2.isnull()

Unnamed: 0,A,B,E
0,False,False,False
1,False,True,False
2,False,True,False
3,False,True,False
4,False,True,False
...,...,...,...
191,False,False,False
192,False,False,False
193,False,False,False
194,False,False,False


In [44]:
# notnull() is the element-wise opposite of isnull(): True where a value is present.
# Column 'B' was already set to missing for row labels 1 through 5 in the previous cell, so the
# assignment is not repeated here; this cell only inspects the first five rows.
newdf2.head().notnull()

Unnamed: 0,A,B,E
0,True,True,True
1,True,False,True
2,True,False,True
3,True,False,True
4,True,False,True


In [45]:
# Recap exercise: build a 3x2 frame of random integers in [0, 100) with columns 'A' and 'B',
# then try the summary methods on it in the following cells.
q = pd.DataFrame(
    data=np.random.randint(low=0, high=100, size=(3, 2)),
    columns=["A", "B"],
)
q

Unnamed: 0,A,B
0,30,66
1,52,31
2,28,78


In [46]:
q.describe()

Unnamed: 0,A,B
count,3.0,3.0
mean,36.666667,58.333333
std,13.316656,24.419937
min,28.0,31.0
25%,29.0,48.5
50%,30.0,66.0
75%,41.0,72.0
max,52.0,78.0


In [47]:
q.mean()

A    36.666667
B    58.333333
dtype: float64

In [48]:
q.median()

A    30.0
B    66.0
dtype: float64

In [49]:
# Standard deviation of each column (the same values as the 'std' row of describe()).
q.std()

A    13.316656
B    24.419937
dtype: float64

In [50]:
# Pairwise correlation matrix: entry (i, j) is the correlation between columns i and j,
# so the diagonal is 1.0 and the matrix is symmetric.
q.corr()

Unnamed: 0,A,B
A,1.0,-0.985059
B,-0.985059,1.0


In [51]:
# count the number of non-null (non-NaN) values in each column or row of a DataFrame or Series.
q.count()

A    3
B    3
dtype: int64

In [52]:
q.max(),q.min()

(A    52
 B    78
 dtype: int64,
 A    28
 B    31
 dtype: int64)