# Manipulating Text data

### Import pandas

In [1]:
import pandas as pd
from pathlib import Path
path = Path('C:/Users/alouden01/Documents/python-pandas-for-data-manipulation/starter_files/skill_13_text_data_basics/StudentsPerformance.csv')

### Read CSV
In this set of videos we'll use a modified version of the [Student Performance](https://www.kaggle.com/spscientist/students-performance-in-exams) dataset from Kaggle. This dataset contains student perfomance to analyze the influence of parent backgrounds, test preparations, etc on students performance.

1. gender and test preparation course are upprcase
2. all scores are strings that need % removed
3. all lunch values have a space preceding each entry (hard to see)

Explore dataset and optimize by converting gender, race, lunch and test prep into categories.

In [2]:
students = pd.read_csv(path)
students.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,FEMALE,group B,bachelor's degree,standard,NONE,72%,72%,74%
1,FEMALE,group C,some college,standard,COMPLETED,69%,90%,88%
2,FEMALE,group B,master's degree,standard,NONE,90%,95%,93%
3,MALE,group A,associate's degree,free/reduced,NONE,47%,57%,44%
4,MALE,group C,some college,standard,NONE,76%,78%,75%


In [3]:
students.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   object
 6   reading score                1000 non-null   object
 7   writing score                1000 non-null   object
dtypes: object(8)
memory usage: 62.6+ KB


In [4]:
students.nunique()
students_copy = students.copy()

In [5]:
students[['gender','race/ethnicity','parental level of education', 'lunch', 'test preparation course']].astype('category')
cat_list = ['gender','race/ethnicity','parental level of education', 'lunch', 'test preparation course']
for c in cat_list:
    students[c] = students[c].astype('category')

In [6]:
students.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype   
---  ------                       --------------  -----   
 0   gender                       1000 non-null   category
 1   race/ethnicity               1000 non-null   category
 2   parental level of education  1000 non-null   category
 3   lunch                        1000 non-null   category
 4   test preparation course      1000 non-null   category
 5   math score                   1000 non-null   object  
 6   reading score                1000 non-null   object  
 7   writing score                1000 non-null   object  
dtypes: category(5), object(3)
memory usage: 29.2+ KB


In [7]:
for column in students:
    print(column)
    students[column] = students[column].str.lower()
students.head()

gender
race/ethnicity
parental level of education
lunch
test preparation course
math score
reading score
writing score


Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group b,bachelor's degree,standard,none,72%,72%,74%
1,female,group c,some college,standard,completed,69%,90%,88%
2,female,group b,master's degree,standard,none,90%,95%,93%
3,male,group a,associate's degree,free/reduced,none,47%,57%,44%
4,male,group c,some college,standard,none,76%,78%,75%


In [8]:
students.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   object
 6   reading score                1000 non-null   object
 7   writing score                1000 non-null   object
dtypes: object(8)
memory usage: 62.6+ KB


In [9]:
students['race/ethnicity'] = students['race/ethnicity'].str.replace(' ', '_')
students['parental level of education'] = students['parental level of education'].str.replace(' ', '_')
students.head()


Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group_b,bachelor's_degree,standard,none,72%,72%,74%
1,female,group_c,some_college,standard,completed,69%,90%,88%
2,female,group_b,master's_degree,standard,none,90%,95%,93%
3,male,group_a,associate's_degree,free/reduced,none,47%,57%,44%
4,male,group_c,some_college,standard,none,76%,78%,75%


In [10]:
for c in students:
    students[c] = students[c].str.replace(' ','')
students.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group_b,bachelor's_degree,standard,none,72%,72%,74%
1,female,group_c,some_college,standard,completed,69%,90%,88%
2,female,group_b,master's_degree,standard,none,90%,95%,93%
3,male,group_a,associate's_degree,free/reduced,none,47%,57%,44%
4,male,group_c,some_college,standard,none,76%,78%,75%


In [11]:

for c in students:
    students[c] = students[c].str.replace('%','')
students.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group_b,bachelor's_degree,standard,none,72,72,74
1,female,group_c,some_college,standard,completed,69,90,88
2,female,group_b,master's_degree,standard,none,90,95,93
3,male,group_a,associate's_degree,free/reduced,none,47,57,44
4,male,group_c,some_college,standard,none,76,78,75


In [12]:
for c in students:
    students = students.rename(columns={f'{c}':f'{c}'.replace(' ','_')})
students.head()

Unnamed: 0,gender,race/ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group_b,bachelor's_degree,standard,none,72,72,74
1,female,group_c,some_college,standard,completed,69,90,88
2,female,group_b,master's_degree,standard,none,90,95,93
3,male,group_a,associate's_degree,free/reduced,none,47,57,44
4,male,group_c,some_college,standard,none,76,78,75


In [15]:
# for c in cat_list:
#     students[c] = students[c].astype('category')
students_copy.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   object
 6   reading score                1000 non-null   object
 7   writing score                1000 non-null   object
dtypes: object(8)
memory usage: 62.6+ KB


In [16]:

sc = students_copy.convert_dtypes()
sc.info()
"students"

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   string
 1   race/ethnicity               1000 non-null   string
 2   parental level of education  1000 non-null   string
 3   lunch                        1000 non-null   string
 4   test preparation course      1000 non-null   string
 5   math score                   1000 non-null   string
 6   reading score                1000 non-null   string
 7   writing score                1000 non-null   string
dtypes: string(8)
memory usage: 62.6 KB


In [23]:
students.head()

Unnamed: 0,gender,race/ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group_b,bachelor's_degree,standard,none,72,72,74
1,female,group_c,some_college,standard,completed,69,90,88
2,female,group_b,master's_degree,standard,none,90,95,93
3,male,group_a,associate's_degree,free/reduced,none,47,57,44
4,male,group_c,some_college,standard,none,76,78,75


In [27]:
students.to_csv(path.parent/'my_students_lower.csv',index=False)

In [26]:
students['reading_score'] = students['reading_score'].astype(int)
int_cols = ['math_score','reading_score', 'writing_score']
for c in students[int_cols]:
    students[c] = students[c].astype(int)
# students.head()
students.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype   
---  ------                       --------------  -----   
 0   gender                       1000 non-null   category
 1   race/ethnicity               1000 non-null   category
 2   parental_level_of_education  1000 non-null   object  
 3   lunch                        1000 non-null   object  
 4   test_preparation_course      1000 non-null   object  
 5   math_score                   1000 non-null   int64   
 6   reading_score                1000 non-null   int64   
 7   writing_score                1000 non-null   int64   
dtypes: category(2), int64(3), object(3)
memory usage: 49.3+ KB


In [27]:
students.nunique()

gender                          2
race/ethnicity                  5
parental_level_of_education     6
lunch                           2
test_preparation_course         2
math_score                     81
reading_score                  72
writing_score                  77
dtype: int64