
## Working with files


### String Formatter

In [98]:
name = 'KGP Talkie is my youtube channel name'
follower = '30k'

In [99]:
print('name', 'follower')
print(name, follower)

name follower
KGP Talkie is my youtube channel name 30k


In [100]:
ds = [('python', 50), ('tensorflow', 100), ('nlp', 200)]

In [101]:
print(ds)

[('python', 50), ('tensorflow', 100), ('nlp', 200)]


In [102]:
for info in ds:
  print(info[0], info[1])

python 50
tensorflow 100
nlp 200


In [103]:
for info in ds:
  print(f'{info[0]:{20}} {info[1]}')

python               50
tensorflow           100
nlp                  200


In [104]:
#>, <, ^
for info in ds:
  print(f'{info[0]:{20}} {info[1]:.>{5}}')

python               ...50
tensorflow           ..100
nlp                  ..200


### Working with files in write / append mode

In [105]:
file = open('data.txt', 'w')

data = 'this is sentence one'
file.write(data)
file.close()

In [106]:
len(data)

20

In [107]:
file = open('data.txt', 'a')

data = '\t this is sentence three'
file.write(data)
file.close()

In [108]:
file = open('data1.txt', 'a') # a if file is there then r+w otherwise w

data = 1
file.write(str(data))
file.close()

In [109]:
file = open('data2.txt', 'w') # a if file is there then r+w otherwise w

data = [1, 'one', 'this is two', 2.3]
for d in data:
    file.write(str(d))
    file.write(',')
file.close()

In [110]:
with open('data3.txt', 'w') as file:
    data = [1, 'one', 'this is two', 2.3]
    for d in data:
        file.write(str(d))
        file.write('\n')

### Working with files in read mode.

In [111]:
file = open('data3.txt', 'r')

In [112]:
file.seek(0)
file.read().splitlines()

['1', 'one', 'this is two', '2.3']

In [113]:
file.seek(0)
file.readlines()

['1\n', 'one\n', 'this is two\n', '2.3\n']

In [114]:
file.seek(0)
data = file.read().splitlines()
file.close()

In [115]:
data

['1', 'one', 'this is two', '2.3']

In [116]:
eval(data[-1])

2.3

In [117]:
for index, d in enumerate(data):
    try:
        data[index] = eval(d)
    except:
        pass

In [118]:
data

[1, 'one', 'this is two', 2.3]

### Reading and Writing .CSV and .TSV Files with Pandas 

In [119]:
import pandas as pd

In [120]:
pd.read_csv('data3.txt', header= None)

Unnamed: 0,0
0,1
1,one
2,this is two
3,2.3


In [121]:
l = [(1, 'one'), (2, 'two'), (3, 'three')]
df = pd.DataFrame(l, columns=['digit', 'figure'])
df.to_csv('digit.csv', sep = ',', index = False)

In [122]:
pd.read_csv('digit.csv')

Unnamed: 0,digit,figure
0,1,one
1,2,two
2,3,three


In [123]:
pd.read_csv('moviereviews.tsv', sep = '\t')

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...
...,...,...
1995,pos,"i like movies with albert brooks , and i reall..."
1996,pos,it might surprise some to know that joel and e...
1997,pos,the verdict : spine-chilling drama from horror...
1998,pos,i want to correct what i wrote in a former ret...


In [124]:
df.to_csv('digit.tsv', sep = '\t', index = None)


### Reading & Writing .XLSX Files with Pandas

In [125]:
%pip install openpyxl
%pip install xlsxwriter

Note: you may need to restart the kernel to use updated packages.



In [126]:
df.to_excel('digit_sheet.xlsx', index = None, sheet_name='digit')

In [127]:
df.to_excel('digit_sheet.xlsx', index = None, sheet_name='digit1')

In [128]:
writer = pd.ExcelWriter('digit_sheet.xlsx', engine = 'xlsxwriter')
df.to_excel(writer, index = None, sheet_name='digit1')
df.to_excel(writer, index = None, sheet_name='digit2')
writer.save()
writer.close()

  writer.save()
  warn("Calling close() on already closed file.")


### Reading & Writing .json files

In [129]:
import json

In [130]:
data_dict = {"one": "1", "two":"2"}

In [131]:
type(data_dict)

dict

In [132]:
data_str = '{"one": "1", "two":"2"}'
type(data_str)

str

In [133]:
#load(), loads(), dump(), dumps()

In [134]:
json.loads(data_str)

{'one': '1', 'two': '2'}

In [135]:
json.dumps(data_dict)

'{"one": "1", "two": "2"}'

In [136]:
file = open('data.json', 'w')
json.dump(data_str, file)
file.close()

In [137]:
file = open('data.json', 'r')
json_data = json.load(file)
file.close

<function TextIOWrapper.close()>

In [138]:
json.loads(json_data)

{'one': '1', 'two': '2'}

### Reading files from URL links

https://datahub.io/core/global-temp/r/monthly.json

https://raw.githubusercontent.com/laxmimerit/All-CSV-ML-Data-Files-Download/master/ecommerce.csv


In [139]:
pd.read_csv('https://raw.githubusercontent.com/laxmimerit/All-CSV-ML-Data-Files-Download/master/ecommerce.csv')

Unnamed: 0,ID,order_date,delivery_date
0,1,5/24/98,2/5/99
1,2,4/22/92,3/6/98
2,4,2/10/91,8/26/92
3,5,7/21/92,11/20/97
4,7,9/2/93,6/10/98
...,...,...,...
496,990,6/24/91,2/2/96
497,991,9/9/91,3/30/98
498,993,11/16/90,4/27/98
499,994,6/3/93,6/13/93


In [140]:
pd.read_json('https://datahub.io/core/global-temp/r/monthly.json')

HTTPError: HTTP Error 404: Not Found


### Extract data from PDF

In [141]:
%pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
   -------------------------------------- 232.6/232.6 kB 712.6 kB/s eta 0:00:00
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Note: you may need to restart the kernel to use updated packages.


In [143]:
import PyPDF2 as pdf

In [144]:
file = open('NLP.pdf', 'rb')

In [146]:
reader = pdf.PdfReader(file)

In [147]:
reader

<PyPDF2._reader.PdfReader at 0x2874ca9cdf0>

In [148]:
help(reader)

Help on PdfReader in module PyPDF2._reader object:

class PdfReader(builtins.object)
 |  PdfReader(stream: Union[str, IO, pathlib.Path], strict: bool = False, password: Union[NoneType, str, bytes] = None) -> None
 |  
 |  Initialize a PdfReader object.
 |  
 |  This operation can take some time, as the PDF stream's cross-reference
 |  tables are read into memory.
 |  
 |  :param stream: A File object or an object that supports the standard read
 |      and seek methods similar to a File object. Could also be a
 |      string representing a path to a PDF file.
 |  :param bool strict: Determines whether user should be warned of all
 |      problems and also causes some correctable problems to be fatal.
 |      Defaults to ``False``.
 |  :param None/str/bytes password: Decrypt PDF file at initialization. If the
 |      password is None, the file will not be decrypted.
 |      Defaults to ``None``
 |  
 |  Methods defined here:
 |  
 |  __init__(self, stream: Union[str, IO, pathlib.Path], 

In [152]:
reader.is_encrypted

False

In [154]:
reader.metadata

{'/ModDate': 'D:20060227152126Z',
 '/CreationDate': 'D:20060227151709Z',
 '/Title': '#n',
 '/Creator': 'Acrobat PDFMaker 6.0 for Word',
 '/Producer': 'Acrobat Distiller 6.0 (Windows)',
 '/Author': 's',
 '/SourceModified': 'D:20060227151632'}

In [157]:
len(reader.pages)

19

In [160]:
page1  = reader.pages[0].extract_text()

In [161]:
page1

'Lkit: A Toolkit for Natua ral La ngua ge Interface Construction \n2. Natural Language Processing (NLP) \nThis section provides a br ief histor y of NLP, in troduces so me of t he main problem s involved  \nin extracting meaning from hu man lan guages a nd exa mines the kind of activities perfor med \nby NLP s ystems. \n \n \n2.1. Background \nNatural language processing s ystem s take strings  of words  (sentences) as their input and \nproduce struc tured representations capturing the meaning of those strings as their output. The  \nnature of this  outp ut depends heavil y on the task at hand. A natural language understandin g \nsystem  serving as an interf ace to a dat abase might accept questions in English w hich relate t o \nthe kind of data held by  the databas e. In this case  the meaning  of the input (the output of the \nsystem ) might be expressed  in terms of structured SQL queries which c an be directly \nsubm itted to the database. \n \nThe first use of com puters to m ani

In [165]:
with open('pdf_text.txt', 'w', encoding='utf-8') as file1:
    for i in range(len(reader.pages)):
        page  = reader.pages[i].extract_text()
        file1.write(page)
        file1.write('\n')