# 10. 行数のカウント

In [1]:
# Approach 1: load every line into a list and report how many there are.
with open("data/popular-names.txt", "r") as text:
    line_total = len(text.readlines())
print(line_total)

2740


In [2]:
# Approach 2: read the file as one string and count its newline characters.
with open("data/popular-names.txt", "r") as text:
    newline_total = text.read().count('\n')
print(newline_total)

2740


### Shell

In [3]:
#先頭に!をつけるとシェルコマンドが使える
#!tldr wc

In [4]:
# Shell equivalent: wc -l prints the number of newline characters in the file.
!wc -l data/popular-names.txt

2740 data/popular-names.txt


### 10 memo

In [5]:
# Memo on the implementations above:
# - Both are worrying memory-wise: one builds a list of every line, the other
#   a single string holding the whole file.
# - It is better to consume one line at a time and throw it away.
# - Keep the newline on the last line in mind.
# - Use `_` for a variable that is never read.
# - `diff` and `head` are handy.

# Improved version: iterate over the file object itself so only one line is
# held at a time (calling readlines() here would build the full list again).
with open("data/popular-names.txt", "r") as text:
    print(sum(1 for _ in text))

2740


# 11.タブをスペースに置換

In [6]:
import itertools

# Print the first 5 lines with every tab replaced by a single space.
# islice pulls lines straight from the file object, so only the lines that are
# actually shown get read (no readlines() list of the whole file).
with open("data/popular-names.txt", "r") as text:
    for line in itertools.islice(text, 5):
        # Each line keeps its own trailing newline, hence end="".
        print(line.replace("\t"," "),end="")

Mary F 7065 1880
Anna F 2604 1880
Emma F 2003 1880
Elizabeth F 1939 1880
Minnie F 1746 1880


### Shell

In [7]:
#!tldr sed

In [8]:
# Replace every tab with a space using sed, then keep the first 10 lines.
# NOTE(review): the "Broken pipe" message in the recorded output presumably
# appears because head exits before sed has finished writing — confirm.
!sed -r 's/\t/ /g' data/popular-names.txt | head

Mary F 7065 1880
Anna F 2604 1880
Emma F 2003 1880
Elizabeth F 1939 1880
Minnie F 1746 1880
Margaret F 1578 1880
Ida F 1472 1880
Alice F 1414 1880
Bertha F 1320 1880
Sarah F 1288 1880
sed: couldn't write 21 items to stdout: Broken pipe


In [9]:
#!tldr tr

In [10]:
# Same replacement with tr, which maps each tab character to a space; head keeps the first 10 lines.
!tr '\t' ' ' < data/popular-names.txt | head

Mary F 7065 1880
Anna F 2604 1880
Emma F 2003 1880
Elizabeth F 1939 1880
Minnie F 1746 1880
Margaret F 1578 1880
Ida F 1472 1880
Alice F 1414 1880
Bertha F 1320 1880
Sarah F 1288 1880
tr: write error: Broken pipe


In [11]:
#!man expand

In [12]:
# Same replacement with expand: -t 1 sets a tab stop every column, so each tab becomes exactly one space.
!expand -t 1 data/popular-names.txt | head

Mary F 7065 1880
Anna F 2604 1880
Emma F 2003 1880
Elizabeth F 1939 1880
Minnie F 1746 1880
Margaret F 1578 1880
Ida F 1472 1880
Alice F 1414 1880
Bertha F 1320 1880
Sarah F 1288 1880
expand: write error: Broken pipe
expand: write error


### 11 memo

In [13]:
#print(,end='')を忘れかけていた
#itertools.islice(f,int)
#ipython上で%timeit,%%file

# 12. 1列目をcol1.txtに，2列目をcol2.txtに保存

In [14]:
# Write column 1 to work/col1.txt and column 2 to work/col2.txt.
# All three files share one `with` statement so the outputs are flushed and
# closed even if a malformed line raises partway through, and the input is
# streamed line by line instead of being loaded with readlines().
with open("data/popular-names.txt","r") as text, \
     open("work/col1.txt","w") as col1, \
     open("work/col2.txt","w") as col2:
    for line in text:
        fields = line.split()
        col1.write(fields[0]+'\n')
        col2.write(fields[1]+'\n')

### Shell

In [15]:
#!man cut

In [16]:
# Shell equivalent: cut -f 1 extracts the first tab-separated field of every line.
!cut -f 1 data/popular-names.txt > work/col1-2.txt

In [17]:
# Shell equivalent: cut -f 2 extracts the second tab-separated field of every line.
!cut -f 2 data/popular-names.txt > work/col2-2.txt

### 結果比較

In [18]:
# Compare the Python output with the cut output; diff prints nothing when the files are identical.
!diff work/col1.txt work/col1-2.txt
!diff work/col2.txt work/col2-2.txt

### 12memo

In [19]:
# Several context managers can be chained in a single `with` statement:
# with open(...) as f1, \
#      open(...) as f2, \
#      open(...) as f3:
#     ...

# Plain unpacking: the number of names must match the number of items.
x, y, z ,w = 'I have a pen'.split()
print(y)

# A starred name collects every item not taken by the other names (as a list).
*rest , a = 'I have a pen'.split()
print(rest)

first, *rest, last = range(10)
print(rest)
# In a call, * spreads the list into separate positional arguments.
print(*rest)

# `*` used this way is called the iterable unpacking operator.


have
['I', 'have', 'a']
[1, 2, 3, 4, 5, 6, 7, 8]
1 2 3 4 5 6 7 8


# 13. col1.txtとcol2.txtをマージ

In [20]:
# Merge col1.txt and col2.txt side by side, tab-separated, into q013.txt.
# One `with` statement owns all three files, so they are closed even when an
# error occurs. zip() walks the two file objects directly, pairing line i of
# one with line i of the other without loading either file into a list.
with open("work/col1.txt","r") as col1, \
     open("work/col2.txt","r") as col2, \
     open("work/q013.txt","w") as new:
    for left, right in zip(col1, col2):
        # Drop only the newline of the left field; the right field keeps its
        # own newline, which terminates the merged line.
        new.write(left.rstrip("\n")+'\t'+right)

In [21]:
# Line counts of the two column files that were merged.
!wc -l "work/col1.txt"
!wc -l "work/col2.txt"

2740 work/col1.txt
2740 work/col2.txt


### Shell

In [22]:
#!tldr paste

In [23]:
# Shell equivalent: paste joins corresponding lines of the two files with a tab.
!paste work/col1.txt work/col2.txt > work/q013-2.txt

### 比較

In [24]:
# Compare the Python merge with the paste result; no output means they are identical.
!diff work/q013.txt work/q013-2.txt

### 13memo

In [25]:
# paste → 横
# cat → 縦

# 14. 先頭からN行を出力

In [26]:
%%file src/q014.py
import itertools
import sys

# Print the first N lines of data/popular-names.txt, where N is the single
# command-line argument.
args = sys.argv

if len(args) != 2: # error 1: exactly one argument is required
    print("usage : q014.py [number]")
# error 2: N must be a positive integer. isdecimal() is used because
# isdigit() also accepts characters such as superscript digits that int()
# cannot parse, which would end in an uncaught ValueError.
elif not args[1].isdecimal() or int(args[1]) <= 0:
    print("usage : number is natural number")
else:
    with open("data/popular-names.txt","r") as text:
        # islice stops after N lines, so the rest of the file is never read.
        for line in itertools.islice(text, int(args[1])):
            print(line,end="")

Overwriting src/q014.py


In [27]:
# Run the script written by the previous cell with N = 3.
!python src/q014.py 3

Mary	F	7065	1880
Anna	F	2604	1880
Emma	F	2003	1880


### Shell

In [28]:
#!tldr head

In [29]:
# Shell equivalent: head -n 3 prints the first 3 lines.
!head -n 3 data/popular-names.txt

Mary	F	7065	1880
Anna	F	2604	1880
Emma	F	2003	1880


### 比較

In [30]:
%%bash
# Process substitution feeds both outputs to diff as files; no output means they match.
diff <(head -n 3 data/popular-names.txt) <(python src/q014.py 3)

### 14memo

In [31]:
import itertools

# Interactive variant: ask for N, then stream only the first N lines.
line_num = int(input("Line Num : "))
with open("data/popular-names.txt") as f:
    # The loop variable is read, so it gets a real name; `_` is kept for
    # values that are never used (and IPython uses `_` for the last output).
    for line in itertools.islice(f,line_num):
        print(line,end='')

Mary	F	7065	1880
Anna	F	2604	1880
Emma	F	2003	1880


In [32]:
#chmod

# argparseは便利
# 　→主に引数の設定等で便利
# main()関数を作ると便利 if __name__ == '__main__':

# 15. 末尾のN行を出力

In [33]:
%%file src/q015.py
import collections
import sys

# Print the last N lines of data/popular-names.txt, where N is the single
# command-line argument.
args = sys.argv

if len(args) != 2: # error 1: exactly one argument is required
    print("usage : q015.py [number]")
# error 2: N must be a positive integer. Zero has to be rejected explicitly:
# a slice starting at -0 is the same as a slice starting at 0, so N = 0 would
# otherwise select the entire file. isdecimal() is used because isdigit()
# also accepts characters that int() cannot parse.
elif not args[1].isdecimal() or int(args[1]) <= 0:
    print("usage : number is natural number")
else:
    with open("data/popular-names.txt","r") as text:
        # A bounded deque keeps only the most recent N lines while the file
        # is streamed, instead of holding every line in a list.
        last_lines = collections.deque(text, maxlen=int(args[1]))
    for line in last_lines:
        # Lines keep their own newline; emitting them unchanged also avoids
        # adding a newline to a final line that has none.
        print(line,end="")

Overwriting src/q015.py


In [34]:
# Run the script written by the previous cell with N = 3.
!python src/q015.py 3

Michael	M	13998	2016
Elijah	M	13764	2016
Ethan	M	13758	2016


### Shell

In [35]:
#!tldr tail

In [36]:
# Shell equivalent: tail -n 3 prints the last 3 lines.
!tail -n 3 data/popular-names.txt

Michael	M	13998	2016
Elijah	M	13764	2016
Ethan	M	13758	2016


### 比較

In [37]:
%%bash
# Process substitution feeds both outputs to diff as files; no output means they match.
diff <(tail -n 3 data/popular-names.txt) <(python src/q015.py 3)

### 15memo

In [38]:
import collections

# Interactive variant: a deque bounded to n items retains only the last n
# lines while the file is streamed through it.
n = int(input("line num : "))
with open('data/popular-names.txt') as f:
    last_lines = collections.deque(f, maxlen=n)
print(''.join(last_lines))

Michael	M	13998	2016
Elijah	M	13764	2016
Ethan	M	13758	2016



In [39]:
# more_itertools is handy: tail() yields the last n items of an iterable.
# `n` is the value entered in the previous cell.
import more_itertools
with open('data/popular-names.txt') as f:
    # Join while the file is still open, in case tail() reads lazily.
    tail_text = ''.join(more_itertools.tail(iterable=f, n=n))
print(tail_text)

Michael	M	13998	2016
Elijah	M	13764	2016
Ethan	M	13758	2016



# 16. ファイルをN分割する

自然数Nをコマンドライン引数などの手段で受け取り，入力のファイルを行単位でN分割せよ．同様の処理をsplitコマンドで実現せよ．

ex) 23行を5つに分配するとき、
    - パターン1:[5,5,5,4,4]
    - パターン2:[5,5,5,5,3]

### code1
メモリが...

In [40]:
%%file src/q016.py
import sys,itertools

def filename(i):
    """Return the output path for piece number ``i``.

    The name is 'work/x' followed by two letters derived from ``i`` in base
    26 (0 -> 'aa', 1 -> 'ab', 26 -> 'ba', ...).
    """
    high, low = divmod(i, 26)
    return 'work/x{}{}'.format(chr(97 + high), chr(97 + low))

def div_numlist(length, N):
    """Return the list of N chunk sizes for splitting ``length`` lines.

    Sizes differ by at most one, with the larger chunks first
    (pattern 1, e.g. 23 lines in 5 pieces -> [5, 5, 5, 4, 4]).
    A non-positive N yields an empty list.
    """
    if N <= 0:
        return []
    base, extra = divmod(length, N)
    # The first `extra` chunks absorb the remainder, one line each.
    return [base + 1] * extra + [base] * (N - extra)

# Split data/popular-names.txt into N files (work/xaa, work/xab, ...), where
# N is the single command-line argument.
args = sys.argv

if len(args) != 2: # error 1: exactly one argument is required
    print("usage : q016.py [number]")
# error 2: N must be a positive integer. isdecimal() is used because
# isdigit() also accepts characters that int() cannot parse.
elif not args[1].isdecimal() or int(args[1]) <= 0:
    print("usage : number is natural number")
else:
    pieces = int(args[1])
    with open("data/popular-names.txt","r") as text:
        lines = text.readlines() # whole file in memory (see code2 for a streaming version)
    if len(lines) < pieces: # error 3: more pieces than lines
        print("Number is too big.")
    else:
        start = 0
        for i, size in enumerate(div_numlist(len(lines), pieces)):
            # `with` closes each piece even if a write fails.
            with open(filename(i),"w") as newfile:
                newfile.writelines(lines[start:start+size])
            start += size

Overwriting src/q016.py


In [41]:
# Run the script written by the previous cell, splitting into 3 pieces.
!python src/q016.py 3

### code2
今まで学んだものを使って

In [42]:
%%file src/q016_2.py

import argparse,itertools

def len_iterable(iterable):
    """Count the items of ``iterable`` by consuming it (works for iterators)."""
    count = 0
    for _ in iterable:
        count += 1
    return count

def parts_len(length, pieces):
    """Return a generator of ``pieces`` chunk sizes (pattern 2).

    Every yielded size is the same: length // pieces, rounded up when the
    division leaves a remainder. The sizes can therefore add up to more than
    ``length``; a consumer that stops at end of input ends up with a shorter
    (possibly empty) final chunk, e.g. 23 lines in 5 pieces -> 5, 5, 5, 5, 3.
    """
    base, remainder = divmod(length, pieces)
    chunk = base + 1 if remainder else base
    return (chunk for _ in range(pieces))

def filename(i):
    """Build the path of the i-th output file: 'work/x' plus a two-letter suffix.

    The suffix encodes ``i`` in base 26 starting from 'aa'.
    """
    first_letter = chr(97 + i // 26)
    second_letter = chr(97 + i % 26)
    return ''.join(['work/x', first_letter, second_letter])

def main():
    """Split the input file into ``args.pieces`` files named by filename(i).

    Reads the module-level ``args`` (set in the ``__main__`` guard): the open
    input file is ``args.file`` and the number of pieces is ``args.pieces``.
    """
    # First pass: count the lines, then rewind for the second pass.
    length = len_iterable(args.file)
    args.file.seek(0)
    for i,n in enumerate(parts_len(length,args.pieces)):
        with open(filename(i),"w") as newfile:
            # Hand writelines() the iterable of lines for this chunk. Passing
            # it one string at a time makes it iterate character by character.
            # islice simply stops early once the input runs out.
            newfile.writelines(itertools.islice(args.file,n))

if __name__ == "__main__":
    # Command line: a positional input file (opened for reading by argparse)
    # and an optional -n/--pieces count that defaults to 3.
    parser = argparse.ArgumentParser(description='Split a file into N pieces')
    parser.add_argument("file", type=argparse.FileType('r'))
    parser.add_argument("-n", "--pieces", type=int, default=3)
    # main() reads this module-level name.
    args = parser.parse_args()

    main()

Overwriting src/q016_2.py


In [43]:
# Run the script written by the previous cell, splitting the data file into 3 pieces.
!python3 src/q016_2.py -n 3 "data/popular-names.txt"

In [44]:
# Line counts of the three pieces written under work/.
!wc -l work/xaa
!wc -l work/xab
!wc -l work/xac

914 work/xaa
914 work/xab
912 work/xac


## Shell

In [45]:
#!tldr split

### Shell Only

In [46]:
%%bash
# Split the file into $div pieces: split -l needs lines-per-piece, so compute
# the line count divided by $div, rounded up when there is a remainder.
div=3
length=$(wc -l < data/popular-names.txt)
arg=$(( length / div ))
if (( length % div != 0 )); then
    arg=$(( arg + 1 ))
fi
split -l "$arg" data/popular-names.txt

### 混合ver

In [47]:
# Total line count of the input, used below to derive lines-per-piece.
!wc -l data/popular-names.txt

2740 data/popular-names.txt


In [48]:
div = 3
# `name = !cmd` captures the command's standard output as a list of lines.
wc_out = !(wc -l data/popular-names.txt|cut -d' ' -f1)
length = int(wc_out[0])
# Lines per piece, rounded up. Integers are compared by value with ==;
# `is` tests object identity and is flagged with a SyntaxWarning when used
# against a literal on newer Python versions.
arg = length//div if length%div == 0 else length//div+1
# $arg interpolates the Python variable into the shell command.
!split -l $arg data/popular-names.txt

In [49]:
# Line counts of the pieces that split wrote into the current directory.
!wc -l xaa
!wc -l xab
!wc -l xac

914 xaa
914 xab
912 xac


## 比較

In [50]:
# Compare each Python-made piece with the one made by split; no output means identical.
!diff work/xaa xaa
!diff work/xab xab
!diff work/xac xac

In [51]:
# Two small multi-line samples that differ only in their middle line
# ("bbbb" vs "dddd"); each starts and ends with a newline.
text1 = "\naaaa\nbbbb\ncccc\n"

text2 = "\naaaa\ndddd\ncccc\n"

## memo

In [52]:
from collections import deque
# divmod
a = 10
b = 3
p,q = divmod(a,b) # returns the quotient and the remainder in one call
print(p,q)

# deque: a container data structure; here it is filled with every line of the file
with open("data/popular-names.txt","r") as f:
    lines = deque(f)

# popleft removes and returns the leftmost item, pop the rightmost
print(lines.popleft())# left end (first line)
print(lines.pop())# right end (last line)

# shell
!echo "10 / 4"
!echo "10 / 4" | bc
# bc is a command that evaluates arithmetic (expr is another one)
# Original note: expressions inside double quotes are expanded, inside single
# quotes they are not. NOTE(review): presumably this refers to shell
# variable/command expansion; the echo above prints "10 / 4" unchanged.

# more_itertools version (credited to Yokoi-san in the original note)
# with open('data/popular-names.txt', 'r') as f:
#     iters = more_itertools.divide(N_Q16, f)
#     for it in iters:
#         print(len(tuple(it)))

3 1
Mary	F	7065	1880

Ethan	M	13758	2016

10 / 4
2


# 17. １列目の文字列の異なり
1列目の文字列の種類（異なる文字列の集合）を求めよ．確認にはcut, sort, uniqコマンドを用いよ．

In [53]:
# Collect the distinct values of column 1 (the first whitespace-separated field).
col1_set = set()
with open("data/popular-names.txt","r") as text:
    for line in text:
        col1_set.add(line.split()[0])
print(col1_set)

{'Emily', 'Crystal', 'Nicholas', 'Barbara', 'Marie', 'Tammy', 'Ethel', 'Kathleen', 'Kelly', 'Nicole', 'Liam', 'Amy', 'Joshua', 'Ethan', 'Alexander', 'Jayden', 'Elizabeth', 'Betty', 'Mark', 'Edward', 'Tracy', 'Clara', 'Jacob', 'Shirley', 'James', 'Taylor', 'Joseph', 'Emma', 'Ruth', 'Joan', 'Minnie', 'Julie', 'Laura', 'Elijah', 'John', 'George', 'Virginia', 'Matthew', 'Isabella', 'Charles', 'Jeffrey', 'Helen', 'Andrew', 'Donna', 'Mary', 'Jason', 'Sharon', 'Cynthia', 'Pamela', 'Brian', 'Olivia', 'Bessie', 'Walter', 'Brandon', 'Sarah', 'Carol', 'Aiden', 'Anthony', 'Heather', 'Frank', 'Austin', 'William', 'Lillian', 'Justin', 'Sandra', 'Doris', 'Dorothy', 'Daniel', 'Jennifer', 'Gary', 'Ronald', 'Madison', 'Lori', 'Anna', 'Hannah', 'Susan', 'Rebecca', 'Samantha', 'Larry', 'Richard', 'Melissa', 'Abigail', 'Margaret', 'Bertha', 'Tyler', 'Kimberly', 'Evelyn', 'Carolyn', 'Thomas', 'Steven', 'Donald', 'Harry', 'Ashley', 'Megan', 'Deborah', 'Florence', 'Debra', 'Alexis', 'Rachel', 'Nancy', 'Robert

### Shell

In [54]:
# Shell check: take column 1, sort it so duplicates become adjacent, let uniq collapse them; show the first 10.
!cut -f1 data/popular-names.txt | sort | uniq | head

Abigail
Aiden
Alexander
Alexis
Alice
Amanda
Amy
Andrew
Angela
Anna


## memo

In [55]:
#sort -u で　sortとuniqができる
!cut -f1 data/popular-names.txt | sort -u | head

Abigail
Aiden
Alexander
Alexis
Alice
Amanda
Amy
Andrew
Angela
Anna


# 18. 各行を3コラム目の数値の降順にソート
各行を3コラム目の数値の逆順で整列せよ（注意: 各行の内容は変更せずに並び替えよ）．確認にはsortコマンドを用いよ（この問題はコマンドで実行した時の結果と合わなくてもよい）．

In [56]:
import itertools

# Sort whole lines by column 3 (as an integer, descending) and show the top 10.
# The lines themselves are sorted with a key function. A dict keyed by the
# count would keep only one line per distinct count and silently drop every
# other row that shares it.
with open("data/popular-names.txt","r") as text:
    ranked = sorted(text, key=lambda line: int(line.split()[2]), reverse=True)

for line in itertools.islice(ranked, 10):
    # Lines are printed unmodified; each keeps its own newline.
    print(line,end='')

Linda	F	99685	1947
Linda	F	96210	1948
James	M	94762	1947
Michael	M	92716	1957
Robert	M	91641	1947
Linda	F	91013	1949
Michael	M	90620	1956
Michael	M	90512	1958
James	M	88584	1948
Michael	M	88525	1954


## Shell

In [57]:
#!man sort

In [58]:
# Shell check: numeric (-n), reversed (-r) sort on the key starting at field 3; show the first 10.
# NOTE(review): -k3 without an end field extends the key to the end of the line;
# -k3,3 would restrict it to column 3 only — confirm whether that matters here.
!sort -k3 -r -n data/popular-names.txt | head

Linda	F	99685	1947
Linda	F	96210	1948
James	M	94762	1947
Michael	M	92716	1957
Robert	M	91641	1947
Linda	F	91013	1949
Michael	M	90620	1956
Michael	M	90512	1958
James	M	88584	1948
Michael	M	88525	1954


## memo

In [59]:
# Shell note: `sort -s` makes the sort stable.

import itertools

def third_column(line):
    """Return the third whitespace-separated field of ``line`` as an int."""
    return int(line.split()[2])

# Sort the lines directly with a key function, largest count first, top 10 only.
with open("data/popular-names.txt","r") as text:
    ranked = sorted(text, key=third_column, reverse=True)
for line in itertools.islice(ranked, 10):
    print(line,end='')

Linda	F	99685	1947
Linda	F	96210	1948
James	M	94762	1947
Michael	M	92716	1957
Robert	M	91641	1947
Linda	F	91013	1949
Michael	M	90620	1956
Michael	M	90512	1958
James	M	88584	1948
Michael	M	88525	1954


# 19. 各行の1コラム目の文字列の出現頻度を求め，出現頻度の高い順に並べる
各行の1列目の文字列の出現頻度を求め，その高い順に並べて表示せよ．確認にはcut, uniq, sortコマンドを用いよ．

### パターン1
ifで判断

In [60]:
import itertools

# Pattern 1: count column-1 values, branching on whether the key was seen before.
dic = {}
with open("data/popular-names.txt","r") as text:
    for line in text:
        key,*rest = line.split()
        if key in dic:
            dic[key] += 1
        else:
            dic[key] = 1

# Most frequent first; equal counts keep the order in which they were first seen.
by_count = sorted(dic.items(), key=lambda pair: pair[1], reverse=True)
for key,v in itertools.islice(by_count, 10):
    print(v,key)

116 James
109 William
108 John
108 Robert
92 Mary
75 Charles
74 Michael
73 Elizabeth
71 Joseph
60 Margaret


### パターン2
ifしない

In [61]:
import itertools
from collections import defaultdict

# Pattern 2: count column-1 values without any membership test.
# defaultdict(int) supplies 0 for a key the first time it is seen, so the cell
# no longer needs the set built in an earlier section. Keys are also inserted
# in order of first appearance, which makes the order of equal counts
# reproducible (seeding the dict from a set gave an arbitrary order).
dic = defaultdict(int)
with open("data/popular-names.txt","r") as text:
    for line in text:
        dic[line.split()[0]]+=1

# Sorting on the negated count puts the most frequent names first.
for key,v in itertools.islice(sorted(dic.items(),key=lambda x:-x[1]),10):
    print(v,key)

116 James
109 William
108 John
108 Robert
92 Mary
75 Charles
74 Michael
73 Elizabeth
71 Joseph
60 Margaret


### memo

In [62]:
from collections import Counter
from pprint import pprint

# Counter does the tallying; most_common(10) returns the ten highest counts
# as (name, count) pairs.
count = Counter()
with open("data/popular-names.txt","r") as f:
    for line in f:
        count[line.split()[0]] += 1
pprint(count.most_common(10))

[('James', 116),
 ('William', 109),
 ('John', 108),
 ('Robert', 108),
 ('Mary', 92),
 ('Charles', 75),
 ('Michael', 74),
 ('Elizabeth', 73),
 ('Joseph', 71),
 ('Margaret', 60)]


## Shell

In [63]:
# Shell check: sort column 1 so equal names are adjacent, count each run with uniq -c,
# then sort those counts numerically in descending order and show the first 10.
!cut -f1 data/popular-names.txt | sort | uniq -c | sort -nr | head

    116 James
    109 William
    108 Robert
    108 John
     92 Mary
     75 Charles
     74 Michael
     73 Elizabeth
     71 Joseph
     60 Margaret


In [64]:
#!tldr uniq