# 3.4 문자열 데이터 처리

In [1]:
import pandas as pd
titanic = pd.read_csv('datasets/train.csv')
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [2]:
titanic.drop(['PassengerId', 'Cabin'], axis=1, inplace=True)

In [4]:
# 문자열 변수인 Name 데이터 타입 확인
titanic.Name.dtype

dtype('O')

In [5]:
# 좀 더 안정적인 string 타입으로 변경
titanic.Name = titanic.Name.astype("string")
titanic.Name.dtype

string[python]

## 3.4.1 문자열 분리하기

In [6]:
titanic.Name.str.split()

0                           [Braund,, Mr., Owen, Harris]
1      [Cumings,, Mrs., John, Bradley, (Florence, Bri...
2                             [Heikkinen,, Miss., Laina]
3      [Futrelle,, Mrs., Jacques, Heath, (Lily, May, ...
4                          [Allen,, Mr., William, Henry]
                             ...                        
886                            [Montvila,, Rev., Juozas]
887                    [Graham,, Miss., Margaret, Edith]
888       [Johnston,, Miss., Catherine, Helen, "Carrie"]
889                           [Behr,, Mr., Karl, Howell]
890                              [Dooley,, Mr., Patrick]
Name: Name, Length: 891, dtype: object

In [7]:
# 기호로 묶인 경우가 있어 기호를 중심으로 단어를 분리하기 위해 pat 활용
titanic.Name.str.split(pat=",")

0                             [Braund,  Mr. Owen Harris]
1      [Cumings,  Mrs. John Bradley (Florence Briggs ...
2                              [Heikkinen,  Miss. Laina]
3        [Futrelle,  Mrs. Jacques Heath (Lily May Peel)]
4                            [Allen,  Mr. William Henry]
                             ...                        
886                             [Montvila,  Rev. Juozas]
887                      [Graham,  Miss. Margaret Edith]
888          [Johnston,  Miss. Catherine Helen "Carrie"]
889                             [Behr,  Mr. Karl Howell]
890                               [Dooley,  Mr. Patrick]
Name: Name, Length: 891, dtype: object

In [8]:
# expand 문자가 분리되는 만큼 개별 컬럼 생성
titanic.Name.str.split(expand=True)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,"Braund,",Mr.,Owen,Harris,,,,,,,,,,
1,"Cumings,",Mrs.,John,Bradley,(Florence,Briggs,Thayer),,,,,,,
2,"Heikkinen,",Miss.,Laina,,,,,,,,,,,
3,"Futrelle,",Mrs.,Jacques,Heath,(Lily,May,Peel),,,,,,,
4,"Allen,",Mr.,William,Henry,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,"Montvila,",Rev.,Juozas,,,,,,,,,,,
887,"Graham,",Miss.,Margaret,Edith,,,,,,,,,,
888,"Johnston,",Miss.,Catherine,Helen,"""Carrie""",,,,,,,,,
889,"Behr,",Mr.,Karl,Howell,,,,,,,,,,


In [9]:
# 두 번째 위치한 데이터만 접근하기
titanic.Name.str.split().str[1]

0        Mr.
1       Mrs.
2      Miss.
3       Mrs.
4        Mr.
       ...  
886     Rev.
887    Miss.
888    Miss.
889      Mr.
890      Mr.
Name: Name, Length: 891, dtype: object

In [10]:
# title 이라는 새 컬럼을 생성하여 호칭 저장
titanic['title'] = titanic.Name.str.split().str[1]

## 3.4.2 문잣값 교체하기

In [11]:
titanic.title.value_counts().head()

title
Mr.        502
Miss.      179
Mrs.       121
Master.     40
Dr.          7
Name: count, dtype: int64

In [12]:
# 마침표를 공백으로 처리
titanic.title.str.replace('.', '', regex=False)

0        Mr
1       Mrs
2      Miss
3       Mrs
4        Mr
       ... 
886     Rev
887    Miss
888    Miss
889      Mr
890      Mr
Name: title, Length: 891, dtype: object

In [13]:
titanic['title'] = titanic.title.str.replace('.', '', regex=False)
titanic.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,title
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,Mr
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,Mrs
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,Miss
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S,Mrs
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S,Mr


In [14]:
titanic.title.value_counts()

title
Mr              502
Miss            179
Mrs             121
Master           40
Dr                7
Rev               6
y                 4
Planke,           3
Impe,             3
Gordon,           2
Col               2
Mlle              2
Major             2
Melkebeke,        1
Jonkheer          1
Shawah,           1
the               1
Velde,            1
Capt              1
Messemaeker,      1
Carlo,            1
Ms                1
Mme               1
Steen,            1
Mulder,           1
Pelsmaeker,       1
Walle,            1
der               1
Billiard,         1
Don               1
Cruyssen,         1
Name: count, dtype: int64

In [15]:
# 호칭 중에서 Mlle, Ms, Mme 등은 Miss, Mrs와 같은 의미로 문잣값을 변경한다
titanic['title'] = titanic['title'].str.replace('Mlle', 'Miss', regex=False)
titanic['title'] = titanic['title'].str.replace('Ms', 'Miss', regex=False)
titanic['title'] = titanic['title'].str.replace('Mme', 'Mrs', regex=False)

In [18]:
rare_name = ['Dr', 'Rev', 'y', 'Planke,', 'Impe,', 'Gordon,', 'Col', 'Major', 'Melkebeke,', 'Jonkheer',
             'Shawah,', 'the', 'Velde,', 'Capt', 'Messemaeker,', 'Carlo,', 'Steen,', 'Mulder,', 'Pelsmaeker,',
             'Walle,', 'der', 'Billiard,', 'Don', 'Cruyssen,']
titanic['title'] = titanic['title'].replace(rare_name, 'Rare', regex=False)
titanic['title'].value_counts()

title
Mr        502
Miss      182
Mrs       122
Rare       45
Master     40
Name: count, dtype: int64

## 3.4.3 정규 표현식 가이드 - 추후 수정

In [20]:
# 이름이 Mr 뿐 아니라 Mrs도 출력된다
titanic[titanic['Name'].str.contains('Mr')].head(10)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,title
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,Mr
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,Mrs
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S,Mrs
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S,Mr
5,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,Q,Mr
6,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,S,Mr
8,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,S,Mrs
9,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,C,Mrs
12,0,3,"Saundercock, Mr. William Henry",male,20.0,0,0,A/5. 2151,8.05,S,Mr
13,0,3,"Andersson, Mr. Anders Johan",male,39.0,1,5,347082,31.275,S,Mr


## 3.4.4 문자 수 세기

In [24]:
# 공백 포함 모든 문자 수 세기
titanic['Name'].str.count('')

0      24
1      52
2      23
3      45
4      25
       ..
886    22
887    29
888    41
889    22
890    20
Name: Name, Length: 891, dtype: Int64

In [25]:
# 단어 수 세기 공백 수 +1 하면 총 단어 수와 일치
titanic['Name'].str.count('')+1

0      25
1      53
2      24
3      46
4      26
       ..
886    23
887    30
888    42
889    23
890    21
Name: Name, Length: 891, dtype: Int64

In [26]:
# 특정 수만 세기
titanic['Name'].str.count('a')

0      2
1      2
2      2
3      3
4      1
      ..
886    2
887    4
888    2
889    1
890    1
Name: Name, Length: 891, dtype: Int64