# Importing libraries

In [1]:
import spacy
import csv
import pandas as pd
import matplotlib
import numpy as np
import re

# Data Collection

### Sentence Segmentation

In [2]:
# Load spaCy's English pipeline "en_core_web_sm". The resulting `nlp` object
# is called on the raw text further down to produce the processed document.
nlp = spacy.load("en_core_web_sm")

In [3]:
# Ask the user for the path of the input text file; an empty answer falls
# back to 'text.txt'.
path=input('Enter the path of your file') or 'text.txt'

Enter the path of your file


In [4]:
# Accumulators for the data that ends up in the CSV file:
#   sentence      - the sentences produced by sentence segmentation
#   reqFormatList - one single-element list per sentence (one CSV row each)
sentence=[]
reqFormatList=[]

In [5]:
# Read the whole text file into one string (not a list of lines). The encoding
# is stated explicitly so decoding does not depend on the platform's default
# locale encoding.
with open(path, 'r', encoding='utf-8') as f:
    lines = f.read()

In [6]:
# Run the language model over the full text read from the file; `doc` is the
# processed document whose `.sents` is iterated below to get the sentences.
doc=nlp(lines)

In [7]:
# Perform sentence segmentation: doc.sents yields the sentences one at a time,
# so they are all collected into the `sentence` list here.
sentence.extend(doc.sents)

### Note:  Doc.sents is a generator
A generator object cannot produce its output (here, the segmented sentences) until it is iterated over. A list, for example, can be iterated over to print its items one by one, but its items can also be accessed directly by index, such as list[0] or list[1], without looping over it.
This is not the case with generator objects, so the Doc is not segmented into sentences until doc.sents is iterated. This means that, although you can print the second Doc token (word token) with print(doc[1]), you cannot get the "second Doc sentence" with print(doc.sents[1]); convert the generator to a list first, e.g. list(doc.sents)[1].

In [8]:
# Wrap every sentence in its own one-element list so that each sentence
# becomes exactly one row (one cell) when written with csv.writer.
reqFormatList.extend([item] for item in sentence)

In [9]:
# Ask the user (no prompt text is shown) for the path of the output CSV file;
# an empty answer falls back to 'Data.csv'.
Path_to_CSV=input() or 'Data.csv'




In [10]:
# Store each sentence in its own cell of the CSV file, under a 'Sentences'
# header. UTF-8 is requested explicitly because the file is read back with
# pandas.read_csv, which decodes as UTF-8 by default; writing with the
# platform default encoding can make non-ASCII text unreadable there.
header=['Sentences']
with open(Path_to_CSV, 'w', newline='', encoding='utf-8') as f:
    writer=csv.writer(f,delimiter=',')
    writer.writerow(header)
    # Every element of reqFormatList is already a one-cell row.
    writer.writerows(reqFormatList)

### The data is now collected in a csv file and is now ready for basic pre-processing.

# Data Pre-Processing

In [11]:
# The first pre-processing step: load the CSV file written above into a
# pandas DataFrame (its single data column is 'Sentences').
df= pd.read_csv(Path_to_CSV)

In [12]:
# Display the DataFrame to inspect the loaded sentences
df

Unnamed: 0,Sentences
0,Lightning lit up the purple sky of the mysteri...
1,"The sky was a dark purple with grey clouds, bo..."
2,"The land was parched and cracking, without any..."
3,They could barely see a single tree around them.
4,"In the distance, the adventurers can see a pal..."
5,\nIn front of the adventurers was a tall figur...
6,He did not speak.
7,"He had pale straw-colored skin, with white spi..."
8,"His hands seemed other worldly, with some kind..."
9,His two scimitars left a trail of shadows as h...


### Removing Unwanted Spaces 

In [13]:
def remove_spaces(text):
    """Remove unwanted spaces around punctuation and drop a few symbols.

    For example, it changes ``I 'm`` to ``I'm`` and ``ca n't`` to ``can't``.

    Steps, in order:
      * a space before an apostrophe followed by a word character is removed;
      * a space before ``,`` is removed;
      * a space followed by a run of ``.``, ``!`` or ``?`` collapses to a
        single such character (so ``wait ...`` becomes ``wait.``);
      * a space before ``n't`` is removed;
      * the characters ``( ) ; _ ^ ` /`` are deleted.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # All patterns are raw strings so that backslashes reach the regex engine
    # unchanged instead of being treated as (invalid) string escapes.
    text = re.sub(r" '(\w)", r"'\1", text)
    text = re.sub(r" ,", ",", text)
    text = re.sub(r" \.+", ".", text)
    text = re.sub(r" !+", "!", text)
    text = re.sub(r" \?+", "?", text)
    text = re.sub(r" n't", "n't", text)
    text = re.sub(r"[();_^`/]", "", text)

    return text

### Expanding the words

In [14]:
# This function expands all the shortened words, for example, changing
# can't to can not
def decontract(text):
    """Expand English contractions in ``text``.

    The irregular forms ``won't`` and ``can't`` are handled first, including
    their capitalised spellings (``Won't`` -> ``Will not``); otherwise the
    generic ``n't`` rule would turn them into ``Wo not`` / ``Ca not``. The
    remaining suffix rules are then applied in order.

    Note that ``'s`` is always expanded to `` is``, so possessives are
    expanded as well.

    Args:
        text: The string to expand.

    Returns:
        The string with contractions expanded.
    """
    def _keep_case(expansion):
        # Build a replacement callback that capitalises the expansion when the
        # matched contraction itself started with an upper-case letter.
        def _replace(match):
            if match.group(0)[0].isupper():
                return expansion.capitalize()
            return expansion
        return _replace

    text = re.sub(r"[Ww]on't", _keep_case("will not"), text)
    text = re.sub(r"[Cc]an't", _keep_case("can not"), text)
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"'re", " are", text)
    text = re.sub(r"'s", " is", text)
    text = re.sub(r"'d", " would", text)
    text = re.sub(r"'ll", " will", text)
    text = re.sub(r"'t", " not", text)
    text = re.sub(r"'ve", " have", text)
    text = re.sub(r"'m", " am", text)
    return text

### Other Pre-Processing Steps

In [15]:
def preprocess(text):
    """Normalise one sentence for further processing.

    Steps, in order: line breaks become single spaces and the ends are
    stripped, unwanted spaces are removed, runs of ``.`` and ``!`` collapse,
    contractions are expanded, every character other than ASCII letters,
    digits and spaces is deleted, and the result is lower-cased.

    Args:
        text: The sentence to clean.

    Returns:
        The cleaned, lower-cased sentence.
    """
    # Replace each line break (with any whitespace around it) by one space
    # instead of deleting it, so the words on either side are not fused
    # together; strip so a leading/trailing break leaves no stray space.
    text = re.sub(r"\s*\n\s*", " ", text).strip()
    text = remove_spaces(text)   # REMOVING UNWANTED SPACES
    text = re.sub(r"\.+", ".", text)
    text = re.sub(r"!+", "!", text)
    text = decontract(text)    # DECONTRACTION
    text = re.sub(r"[^A-Za-z0-9 ]+", "", text)  # REMOVING UNWANTED CHARACTERS
    text = text.lower()  # CONVERTING TO LOWER CASE
    return text  # RETURNING THE PROCESSED TEXT

### Applying Functions to Data Frame 

In [16]:
# Clean every sentence in the 'Sentences' column. Series.map calls preprocess
# on each value directly, which avoids building a Series per row as
# DataFrame.iterrows() does and the label-based write-back through df.at.
df['Sentences'] = df['Sentences'].map(preprocess)