# Pandas

### Important points to use Pandas

#### *Open Source Data Analysis Library*
#### *Uses numpy for data processing, thus fetures of numpy are also used*
#### *Provides rich, high, robust, huge amount data processing*

##### Series - 1d array capable of holding any type of data (column of a dataframe)
##### Dataframe - 2d array with rows and columns

In [1]:
# installing pandas
!pip install pandas



In [2]:
# importing libraries
import numpy as np
import pandas as pd

In [3]:
# creating a random dictionary
dict1 = {
    "Name": ["Lorem", "Ipsum", "Dolor", "Sit Amet", "Voluptatem"],
    "Marks": [81, 92, 85, 87, 96],
    "City": ["Sollicitudin", "Dignissim", "Suspendisse", "Pellentesque", "Efficitur"]
}

In [4]:
# converting dictionary to dataframe
df = pd.DataFrame(dict1)

In [5]:
df

Unnamed: 0,Name,Marks,City
0,Lorem,81,Sollicitudin
1,Ipsum,92,Dignissim
2,Dolor,85,Suspendisse
3,Sit Amet,87,Pellentesque
4,Voluptatem,96,Efficitur


In [6]:
# saving the data as csv
df.to_csv("data1.csv")
# df.to_csv("data1.csv", index=false)                    # for no indexing

In [7]:
# displaying first 2 rows
df.head(2)

Unnamed: 0,Name,Marks,City
0,Lorem,81,Sollicitudin
1,Ipsum,92,Dignissim


In [8]:
# displaying last 2 rows
df.tail(2)

Unnamed: 0,Name,Marks,City
3,Sit Amet,87,Pellentesque
4,Voluptatem,96,Efficitur


In [9]:
# performs statistical analysis
df.describe()

Unnamed: 0,Marks
count,5.0
mean,88.2
std,5.890671
min,81.0
25%,85.0
50%,87.0
75%,92.0
max,96.0


In [10]:
df.index = [5 * i for i in range(1, 6)]                             # changing index as per the requirements

In [11]:
df

Unnamed: 0,Name,Marks,City
5,Lorem,81,Sollicitudin
10,Ipsum,92,Dignissim
15,Dolor,85,Suspendisse
20,Sit Amet,87,Pellentesque
25,Voluptatem,96,Efficitur


In [12]:
[type(df['Name']), type(df)]                                # series vs dataframe

[pandas.core.series.Series, pandas.core.frame.DataFrame]

In [13]:
# generating a random series
ser = pd.Series(np.random.rand(34))

In [14]:
ser

0     0.533195
1     0.769249
2     0.411873
3     0.563049
4     0.617291
5     0.639724
6     0.592298
7     0.782093
8     0.535233
9     0.916628
10    0.086878
11    0.759165
12    0.178085
13    0.237794
14    0.940479
15    0.821607
16    0.098544
17    0.056202
18    0.909083
19    0.960868
20    0.351020
21    0.602913
22    0.062805
23    0.164223
24    0.688124
25    0.686829
26    0.254855
27    0.712641
28    0.493764
29    0.878928
30    0.508715
31    0.236381
32    0.296574
33    0.370135
dtype: float64

In [15]:
type(ser)

pandas.core.series.Series

In [16]:
# creating a random dataframe
dafr = pd.DataFrame(np.random.rand(300, 5), index=np.arange(300))

In [17]:
dafr

Unnamed: 0,0,1,2,3,4
0,0.532254,0.901541,0.428067,0.834881,0.399360
1,0.677715,0.052650,0.161786,0.135735,0.541248
2,0.655530,0.400686,0.216088,0.619329,0.477689
3,0.511622,0.549976,0.254571,0.876576,0.907613
4,0.623958,0.667805,0.093308,0.632558,0.355616
5,0.546309,0.113880,0.858330,0.549393,0.639319
6,0.855396,0.027858,0.870869,0.984981,0.125304
7,0.601596,0.522100,0.953205,0.598733,0.566938
8,0.350522,0.803077,0.287948,0.944318,0.764139
9,0.752988,0.430127,0.108329,0.599917,0.618606


In [18]:
type(dafr)

pandas.core.frame.DataFrame

In [19]:
dafr.dtypes                                               # returns the column wise data type

0    float64
1    float64
2    float64
3    float64
4    float64
dtype: object

In [20]:
dafr[0][0] = 'lorem'

In [21]:
dafr.dtypes                                               # multiple dtype values in same column will generate an object series

0     object
1    float64
2    float64
3    float64
4    float64
dtype: object

In [22]:
dafr.index                                            # returns indices

Int64Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
            ...
            290, 291, 292, 293, 294, 295, 296, 297, 298, 299],
           dtype='int64', length=300)

In [23]:
dafr.columns

RangeIndex(start=0, stop=5, step=1)

In [24]:
# converting dataframe to numpy array
dafr1 = dafr.to_numpy()

In [25]:
[type(dafr), type(dafr1)]

[pandas.core.frame.DataFrame, numpy.ndarray]

In [26]:
# sorting dataframe with index
dafr.sort_index(axis=0, ascending=False).head()

Unnamed: 0,0,1,2,3,4
299,0.908226,0.468186,0.900876,0.197394,0.357243
298,0.87925,0.552482,0.900695,0.443929,0.329581
297,0.370154,0.232883,0.125087,0.387765,0.360222
296,0.02938,0.734258,0.63299,0.991994,0.020098
295,0.629341,0.616104,0.805038,0.678918,0.267112


In [27]:
dafr2 = dafr                                                        # not copying instead indicating same location

In [28]:
dafr2[0][0] = "ipsum"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [29]:
dafr2.head()

Unnamed: 0,0,1,2,3,4
0,ipsum,0.901541,0.428067,0.834881,0.39936
1,0.677715,0.05265,0.161786,0.135735,0.541248
2,0.65553,0.400686,0.216088,0.619329,0.477689
3,0.511622,0.549976,0.254571,0.876576,0.907613
4,0.623958,0.667805,0.093308,0.632558,0.355616


In [30]:
dafr.head()

Unnamed: 0,0,1,2,3,4
0,ipsum,0.901541,0.428067,0.834881,0.39936
1,0.677715,0.05265,0.161786,0.135735,0.541248
2,0.65553,0.400686,0.216088,0.619329,0.477689
3,0.511622,0.549976,0.254571,0.876576,0.907613
4,0.623958,0.667805,0.093308,0.632558,0.355616


In [31]:
# copying one dataframe inside other
dafr3 = dafr.copy()

In [32]:
dafr3[0][0] = 'Dolor'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [33]:
dafr3.head()

Unnamed: 0,0,1,2,3,4
0,Dolor,0.901541,0.428067,0.834881,0.39936
1,0.677715,0.05265,0.161786,0.135735,0.541248
2,0.65553,0.400686,0.216088,0.619329,0.477689
3,0.511622,0.549976,0.254571,0.876576,0.907613
4,0.623958,0.667805,0.093308,0.632558,0.355616


In [34]:
dafr.head()

Unnamed: 0,0,1,2,3,4
0,ipsum,0.901541,0.428067,0.834881,0.39936
1,0.677715,0.05265,0.161786,0.135735,0.541248
2,0.65553,0.400686,0.216088,0.619329,0.477689
3,0.511622,0.549976,0.254571,0.876576,0.907613
4,0.623958,0.667805,0.093308,0.632558,0.355616


In [35]:
dafr.loc[0, 0] = 'amet'                                    # genuine method

In [36]:
dafr.head()

Unnamed: 0,0,1,2,3,4
0,amet,0.901541,0.428067,0.834881,0.39936
1,0.677715,0.05265,0.161786,0.135735,0.541248
2,0.65553,0.400686,0.216088,0.619329,0.477689
3,0.511622,0.549976,0.254571,0.876576,0.907613
4,0.623958,0.667805,0.093308,0.632558,0.355616


In [37]:
dafr.loc[0, 5] = 456

In [38]:
dafr.head()

Unnamed: 0,0,1,2,3,4,5
0,amet,0.901541,0.428067,0.834881,0.39936,456.0
1,0.677715,0.05265,0.161786,0.135735,0.541248,
2,0.65553,0.400686,0.216088,0.619329,0.477689,
3,0.511622,0.549976,0.254571,0.876576,0.907613,
4,0.623958,0.667805,0.093308,0.632558,0.355616,


In [39]:
# deletes an entire columns
dafr = dafr.drop(5, axis=1)

In [40]:
dafr.head()

Unnamed: 0,0,1,2,3,4
0,amet,0.901541,0.428067,0.834881,0.39936
1,0.677715,0.05265,0.161786,0.135735,0.541248
2,0.65553,0.400686,0.216088,0.619329,0.477689
3,0.511622,0.549976,0.254571,0.876576,0.907613
4,0.623958,0.667805,0.093308,0.632558,0.355616


In [41]:
dafr.reset_index(drop=True, inplace=True)                                     # resets the index