In [None]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


### Creating a simple dataframe

In [None]:
# `index` supplies the row labels and `columns` supplies the column labels;
# each inner list of `data` becomes one row of the frame.
pd.DataFrame(
    data=[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
    index=["R1", "R2", "R3"],
    columns=["A", "B", "C"],
)

Unnamed: 0,A,B,C
R1,1,2,3
R2,4,5,6
R3,7,8,9


In [None]:
# Each key of the dict becomes a column label and its list becomes that
# column's values; the rows receive the default 0, 1, 2 index.
dictionary = dict(A=[1, 2, 3], B=[4, 5, 6], C=[7, 8, 9])

pd.DataFrame(data=dictionary)

Unnamed: 0,A,B,C
0,1,4,7
1,2,5,8
2,3,6,9


### Loading our CSV file with pandas and storing its data in a DataFrame called `df`

In [None]:
# Column labels, in file order. They are handed to read_csv through `names`,
# so the labels of `df` come from this list rather than from the file.
cols = ["fLength", "fWidth", "fSize", "fConc", "fConc1", "fAsym", "fM3Long", "fM3Trans", "fAlpha", "fDist", "class"]

# Location of magic04.data. Set the MAGIC_DATA_PATH environment variable to
# point at your own copy of the file; when it is unset or empty, the original
# author's local path is used as the fallback.
DATA_PATH = (
    os.environ.get("MAGIC_DATA_PATH")
    or "/Users/mahamfarooq/Desktop/magic+gamma+telescope/magic04.data"
)

# header=None is what pandas already infers when `names` is supplied; spelling
# it out makes it explicit that the first line of the file is treated as data.
df = pd.read_csv(DATA_PATH, names=cols, header=None)

df

Unnamed: 0,fLength,fWidth,fSize,fConc,fConc1,fAsym,fM3Long,fM3Trans,fAlpha,fDist,class
0,28.7967,16.0021,2.6449,0.3918,0.1982,27.7004,22.0110,-8.2027,40.0920,81.8828,g
1,31.6036,11.7235,2.5185,0.5303,0.3773,26.2722,23.8238,-9.9574,6.3609,205.2610,g
2,162.0520,136.0310,4.0612,0.0374,0.0187,116.7410,-64.8580,-45.2160,76.9600,256.7880,g
3,23.8172,9.5728,2.3385,0.6147,0.3922,27.2107,-6.4633,-7.1513,10.4490,116.7370,g
4,75.1362,30.9205,3.1611,0.3168,0.1832,-5.5277,28.5525,21.8393,4.6480,356.4620,g
...,...,...,...,...,...,...,...,...,...,...,...
19015,21.3846,10.9170,2.6161,0.5857,0.3934,15.2618,11.5245,2.8766,2.4229,106.8258,h
19016,28.9452,6.7020,2.2672,0.5351,0.2784,37.0816,13.1853,-2.9632,86.7975,247.4560,h
19017,75.4455,47.5305,3.4483,0.1417,0.0549,-9.3561,41.0562,-9.4662,30.2987,256.5166,h
19018,120.5135,76.9018,3.9939,0.0944,0.0683,5.8043,-93.5224,-63.8389,84.6874,408.3166,h


### Some attributes and methods of dataframes

In [None]:
# A notebook cell renders only its last expression, so every intermediate
# result is passed to display() (made available by IPython/Jupyter) to show it.
display(df.head())                       # first 5 rows
display(df.tail())                       # last 5 rows
df.info()                                # prints its own summary, so no display() needed
display(df.select_dtypes("object"))      # only the object-dtype columns
display(df.shape)                        # (number of rows, number of columns)
display(df.describe(include="object"))   # summary statistics of the object-dtype columns
df.columns                               # column labels; rendered as the cell's result

Index(['fLength', 'fWidth', 'fSize', 'fConc', 'fConc1', 'fAsym', 'fM3Long',
       'fM3Trans', 'fAlpha', 'fDist', 'class'],
      dtype='object')

## Dataframe Indexing

### Without df.loc

In [None]:
# A list of labels inside [] selects columns and gives back a DataFrame,
# even when the list names just one column.
single_column = ["fWidth"]
df[single_column]

Unnamed: 0,fWidth
0,16.0021
1,11.7235
2,136.0310
3,9.5728
4,30.9205
...,...
19015,10.9170
19016,6.7020
19017,47.5305
19018,76.9018


In [None]:
# Same list-based selection with two labels; the result keeps the columns in
# the order in which they are listed.
chosen_columns = ["fWidth", "class"]
df[chosen_columns]

Unnamed: 0,fWidth,class
0,16.0021,g
1,11.7235,g
2,136.0310,g
3,9.5728,g
4,30.9205,g
...,...,...
19015,10.9170,h
19016,6.7020,h
19017,47.5305,h
19018,76.9018,h


In [None]:
# Comparing a column with a value works element by element and yields a
# boolean Series: True where the class is "g", False elsewhere.
is_gamma = df["class"] == "g"
is_gamma

0         True
1         True
2         True
3         True
4         True
         ...  
19015    False
19016    False
19017    False
19018    False
19019    False
Name: class, Length: 19020, dtype: bool

In [None]:
# Putting a boolean Series inside [] keeps only the rows where it is True.
gamma_rows = df["class"] == "g"
df[gamma_rows]

Unnamed: 0,fLength,fWidth,fSize,fConc,fConc1,fAsym,fM3Long,fM3Trans,fAlpha,fDist,class
0,28.7967,16.0021,2.6449,0.3918,0.1982,27.7004,22.0110,-8.2027,40.0920,81.8828,g
1,31.6036,11.7235,2.5185,0.5303,0.3773,26.2722,23.8238,-9.9574,6.3609,205.2610,g
2,162.0520,136.0310,4.0612,0.0374,0.0187,116.7410,-64.8580,-45.2160,76.9600,256.7880,g
3,23.8172,9.5728,2.3385,0.6147,0.3922,27.2107,-6.4633,-7.1513,10.4490,116.7370,g
4,75.1362,30.9205,3.1611,0.3168,0.1832,-5.5277,28.5525,21.8393,4.6480,356.4620,g
...,...,...,...,...,...,...,...,...,...,...,...
12327,12.8703,11.4444,2.3811,0.7360,0.3805,-15.0946,5.3032,11.6208,21.0120,204.0370,g
12328,26.8595,20.5946,2.8754,0.3438,0.2152,-3.4556,-20.0014,-9.0535,3.9848,205.4980,g
12329,22.0913,10.8949,2.2945,0.5381,0.2919,15.2776,18.2296,7.3975,21.0680,123.2810,g
12330,56.2216,18.7019,2.9297,0.2516,0.1393,96.5758,-41.2969,11.3764,5.9110,197.2090,g


In [None]:
# Pick the fWidth column first, then slice it with [start:stop:step].
# The stop value is exclusive here, so 1:100:10 yields rows 1, 11, ..., 91.
every_tenth = slice(1, 100, 10)  # identical to writing [1:100:10]
df["fWidth"][every_tenth]

1     11.7235
11    16.4600
21    21.8287
31    26.7866
41    11.3962
51    23.0143
61    18.4512
71    25.9259
81    33.6384
91    23.0735
Name: fWidth, dtype: float64

### With df.loc

In [None]:
# General form: df.loc[row_selector, column_selector].
# .loc slices by label and includes both endpoints, so 1:100 returns 100 rows.
width_label = "fWidth"
df.loc[1:100, width_label]

1       11.7235
2      136.0310
3        9.5728
4       30.9205
5       21.1502
         ...   
96      16.2843
97      15.6976
98      11.7563
99      25.6580
100     11.2825
Name: fWidth, Length: 100, dtype: float64

In [None]:
# Rows: ::100 takes every 100th row from the first to the last.
# Columns: only the three labels listed below.
picked_columns = ["fWidth", "fSize", "fConc"]
df.loc[::100, picked_columns]

Unnamed: 0,fWidth,fSize,fConc
0,16.0021,2.6449,0.3918
100,11.2825,3.0071,0.3119
200,17.5010,2.8011,0.2735
300,16.4943,2.8109,0.3246
400,49.7480,4.3237,0.0584
...,...,...,...
18600,45.4409,3.4272,0.0875
18700,89.9911,4.2954,0.1221
18800,16.1675,3.0979,0.2890
18900,21.0873,2.7320,0.3441


In [None]:
# Rows: those for which the condition fWidth > 30 is True.
# Columns: 'fWidth', 'fSize' and 'fConc'.
is_wide = df["fWidth"] > 30
df.loc[is_wide, ["fWidth", "fSize", "fConc"]]

Unnamed: 0,fWidth,fSize,fConc
2,136.0310,4.0612,0.0374
4,30.9205,3.1611,0.3168
8,46.5165,4.1540,0.0779
14,33.1061,3.1944,0.4679
20,71.8818,3.8484,0.0780
...,...,...,...
19011,44.9929,3.5488,0.1656
19013,76.5568,3.6872,0.1123
19017,47.5305,3.4483,0.1417
19018,76.9018,3.9939,0.0944


In [None]:
# Both selectors are label slices, and label slices include their endpoints:
# rows labelled 1 through 100, columns from fLength up to and including fConc1.
column_span = slice("fLength", "fConc1")  # identical to writing 'fLength':'fConc1'
df.loc[1:100, column_span]

Unnamed: 0,fLength,fWidth,fSize,fConc,fConc1
1,31.6036,11.7235,2.5185,0.5303,0.3773
2,162.0520,136.0310,4.0612,0.0374,0.0187
3,23.8172,9.5728,2.3385,0.6147,0.3922
4,75.1362,30.9205,3.1611,0.3168,0.1832
5,51.6240,21.1502,2.9085,0.2420,0.1340
...,...,...,...,...,...
96,23.4494,16.2843,2.5605,0.4539,0.2655
97,32.0586,15.6976,2.5072,0.3826,0.2037
98,18.4475,11.7563,2.4074,0.5714,0.4051
99,41.4001,25.6580,3.0414,0.2036,0.1023
