# **Selecting Columns**

In [49]:
import pandas as pd
import numpy as np

In [50]:
# Load the student performance data; the relative path is resolved against the working directory.
df_csv = pd.read_csv(filepath_or_buffer="StudentsPerformance.csv")

In [51]:
# Preview the first five rows to check that the file loaded as expected.
df_csv.head(n=5)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [52]:
# select a column with [] (preferred way to select a column)
# a single column label inside [] returns a pandas Series
df_csv['math score']

0      72
1      69
2      90
3      47
4      76
       ..
995    88
996    62
997    59
998    68
999    77
Name: math score, Length: 1000, dtype: int64

In [53]:
# [] selection works the same way for a text column (shown with dtype: object)
df_csv['gender']

0      female
1      female
2      female
3        male
4        male
        ...  
995    female
996      male
997    female
998    female
999    female
Name: gender, Length: 1000, dtype: object

In [54]:
# confirm that a single selected column is a Series, not a DataFrame
type(df_csv["gender"])

pandas.core.series.Series

In [55]:
# Series: attributes and methods
# .dtype is an attribute (no parentheses): the data type of the column's values
df_csv['math score'].dtype

dtype('int64')

In [56]:
# .head() is a method: show the first five values of the Series
df_csv["math score"].head(n=5)


0    72
1    69
2    90
3    47
4    76
Name: math score, dtype: int64

In [57]:
# .mean() is a method: the average of all math scores
df_csv["math score"].mean()

np.float64(66.089)

In [58]:
# .shape is an attribute; for a Series it is a one-element tuple (number of rows,)
df_csv["math score"].shape

(1000,)

In [59]:
# Jupyter renders only the LAST expression of a cell, so stacking .head(),
# .mean(), .std(), .max() and .min() above describe() computed them and then
# silently threw the results away. describe() already reports count, mean,
# std, min, quartiles and max in a single Series, so it is all this cell needs.
df_csv["math score"].describe()

count    1000.00000
mean       66.08900
std        15.16308
min         0.00000
25%        57.00000
50%        66.00000
75%        77.00000
max       100.00000
Name: math score, dtype: float64

In [60]:
# select a column with dot notation
# displays the same Series as df_csv['gender']
df_csv.gender

0      female
1      female
2      female
3        male
4        male
        ...  
995    female
996      male
997    female
998    female
999    female
Name: gender, Length: 1000, dtype: object

In [61]:
# select a column with dot notation (pitfalls)
# df_csv.math score
# SyntaxError: invalid syntax
# This happens because of the space in the column name
# To avoid this, we should use the [] notation: df_csv["math score"]
# Dot notation only works when the column name is a valid Python identifier
# (e.g. df_csv.math_score would work only if the column were named "math_score")

# Selecting Two or More Columns from a DataFrame

In [62]:
# selecting 2 columns using [[]]
# the inner [] is a Python list of column labels; the result is a DataFrame
df_csv[["gender", "math score"]]  

Unnamed: 0,gender,math score
0,female,72
1,female,69
2,female,90
3,male,47
4,male,76
...,...,...
995,female,88
996,male,62
997,female,59
998,female,68


In [63]:
# confirm that selecting with a list of labels gives a DataFrame, not a Series
type(df_csv[["gender", "math score"]])

pandas.core.frame.DataFrame

In [64]:
# columns are returned in the order they are listed, not the order in the file
df_csv[["math score", "reading score", "writing score", "gender"]]

Unnamed: 0,math score,reading score,writing score,gender
0,72,72,74,female
1,69,90,88,female
2,90,95,93,female
3,47,57,44,male
4,76,78,75,male
...,...,...,...,...
995,88,99,95,female
996,62,55,55,male
997,59,71,65,female
998,68,78,77,female


In [65]:
# Adding New Column to DataFrame

# adding a new column with a scalar value
# the single value 100 is repeated for every row

df_csv["New Column"] = 100
df_csv.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,New Column
0,female,group B,bachelor's degree,standard,none,72,72,74,100
1,female,group C,some college,standard,completed,69,90,88,100
2,female,group B,master's degree,standard,none,90,95,93,100
3,male,group A,associate's degree,free/reduced,none,47,57,44,100
4,male,group C,some college,standard,none,76,78,75,100


In [66]:
# Adding a new column with an array
# one value per row: the integers 0 through 999
language_score = np.arange(1000)
# len(language_score)

In [67]:
# the array supplies one value per row, so its length must match the DataFrame's length
df_csv["language score"] = language_score
df_csv

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,New Column,language score
0,female,group B,bachelor's degree,standard,none,72,72,74,100,0
1,female,group C,some college,standard,completed,69,90,88,100,1
2,female,group B,master's degree,standard,none,90,95,93,100,2
3,male,group A,associate's degree,free/reduced,none,47,57,44,100,3
4,male,group C,some college,standard,none,76,78,75,100,4
...,...,...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95,100,995
996,male,group C,high school,free/reduced,none,62,55,55,100,996
997,female,group C,high school,free/reduced,completed,59,71,65,100,997
998,female,group D,some college,standard,completed,68,78,77,100,998


In [68]:
# draw 1000 random integers starting at 1; the upper bound (100) is exclusive
# the result is only displayed here, not stored in a variable
np.random.randint(1,100, size=1000)

array([16, 67, 39, 53, 42, 66, 63, 66, 13, 95, 83, 52, 15, 86, 28, 42,  5,
       36,  3, 56, 65, 40, 97, 84, 99, 57, 69, 29, 11, 95, 58, 22,  1, 20,
        1, 17, 47, 96, 87,  6, 54, 63, 95, 31, 57, 87, 30, 81, 65, 69, 32,
       81, 28, 16, 38, 67, 43, 78, 63, 19, 27, 74, 97, 57,  8, 36, 40, 88,
        6, 97, 39, 26, 69, 56, 99, 95, 24, 17, 82, 88, 22, 39, 85, 58, 93,
       91,  3, 11, 76, 11, 54, 23, 18, 49, 33, 97, 31, 35, 86, 19, 20, 11,
       88, 66, 86, 81, 42, 16, 73, 15, 26, 15, 27, 66,  4, 84, 85,  4, 94,
       73, 30, 52, 23, 99, 50, 44, 87, 35, 11, 66, 17, 35, 20, 58, 71, 98,
       36, 38, 61, 51, 36, 35, 63, 89, 70, 86, 76, 16, 61, 25, 90, 12, 68,
       87, 99, 17, 65, 75,  8, 78, 95, 70, 44, 39, 15, 67, 20, 82, 34,  5,
       50, 14, 23, 50, 26, 94, 95, 61, 29, 54, 98, 45, 14, 44, 28, 23, 55,
       94, 37, 22, 28, 21, 10, 47, 39, 93, 24, 34, 42, 44, 73, 65, 41, 11,
       57, 33, 50, 74, 44, 39, 85, 74, 50, 44, 73, 81, 21, 83, 12, 96,  2,
       11, 52, 48, 50, 37

In [69]:
# randint's upper bound is exclusive, so 101 is needed for scores of 1..100.
int_language_score = np.random.randint(1, 101, size=1000)
# Use the array's own vectorised .max() rather than the builtin max(), which
# walks the NumPy array one element at a time in Python.
int_language_score.max()

np.int32(100)

In [70]:
# store the random integer scores as a new column (one value per row)
df_csv["New Language Score"] = int_language_score
df_csv

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,New Column,language score,New Language Score
0,female,group B,bachelor's degree,standard,none,72,72,74,100,0,7
1,female,group C,some college,standard,completed,69,90,88,100,1,30
2,female,group B,master's degree,standard,none,90,95,93,100,2,36
3,male,group A,associate's degree,free/reduced,none,47,57,44,100,3,55
4,male,group C,some college,standard,none,76,78,75,100,4,43
...,...,...,...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95,100,995,32
996,male,group C,high school,free/reduced,none,62,55,55,100,996,50
997,female,group C,high school,free/reduced,completed,59,71,65,100,997,43
998,female,group D,some college,standard,completed,68,78,77,100,998,28


In [71]:
# creating random float numbers
# draws 1000 floats with low=1 and high=100; the result is only displayed, not stored
np.random.uniform(1, 100, size=1000)

array([70.54563001, 90.25139917, 34.08359501, 93.50307509, 13.78907218,
       84.62514463,  4.37832197, 75.68068242, 81.01840209, 32.98807536,
       61.13334237,  7.15091866, 72.08475131, 74.42180638, 92.58920754,
       14.71681561, 64.32239674, 85.55856018, 84.004279  , 19.65219743,
       39.44291945, 81.68103347, 54.19688491, 76.39732577, 62.87709308,
       72.28602157, 71.39253094, 75.29233218, 40.22800541, 64.85612754,
       27.71850075, 75.32389226, 85.78662091, 38.66588083, 30.72253999,
       77.93115189, 37.1290508 , 24.23391273, 45.20447875, 78.59347275,
       39.87874994,  8.2443578 , 29.19732465, 51.52841047,  2.00576052,
       76.84294852, 43.99913265, 32.97733287,  2.97847188, 95.7224074 ,
       90.71544375, 68.44126425, 18.90418983, 92.90609347, 72.57166642,
       54.56467602, 26.51983106, 61.0373191 , 77.54198088, 46.28160446,
       83.66936226, 16.58037506, 10.75089825, 90.89715914, 66.21514109,
       48.51579519, 78.57516613, 34.00138939, 86.48465158,  6.15

In [72]:
# Assign a new column using assign() and insert() methods
# You use the assign method when you want to add multiple columns at once
# You use the insert method when you want to add a column at a specific position
# What if we do not declare the size?
# np.random.randint(1, 101) then returns a single integer, which pandas repeats for every row (no error).
# An error is raised only when an array's length does not match the length of the DataFrame.

# df_csv = df_csv.assign(
#     science_score = np.random.randint(1,101, size=1000),
#     social_studies_score = np.random.randint(1,101, size=1000)
# )

# df_csv.head()

# When to use assign

- Add multiple columns in a single line of code


- When you need to overwrite the values of existing columns (best practice)


It returns a new object (a copy) with all the original columns in addition to the new ones

In [73]:
# Two independent sets of 1000 random integer scores.
# `high` is exclusive, so the values fall in 1..99.
score1 = np.random.randint(low=1, high=100, size=1000)
score2 = np.random.randint(low=1, high=100, size=1000)

In [74]:
# Wrap each score array in a Series labelled 0..999.
row_labels = np.arange(1000)
series1 = pd.Series(data=score1, index=row_labels)
series2 = pd.Series(data=score2, index=row_labels.copy())

In [75]:
# using assign method to add multiple columns
df_csv.assign(score1 = series1, score2 = series2)
df_csv.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,New Column,language score,New Language Score
0,female,group B,bachelor's degree,standard,none,72,72,74,100,0,7
1,female,group C,some college,standard,completed,69,90,88,100,1,30
2,female,group B,master's degree,standard,none,90,95,93,100,2,36
3,male,group A,associate's degree,free/reduced,none,47,57,44,100,3,55
4,male,group C,some college,standard,none,76,78,75,100,4,43
