In [None]:
from google.colab import drive
import pandas as pd
import torch
import nltk
from nltk.tokenize import word_tokenize
import numpy as np
import io
import re
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split


In [None]:
# Mount Google Drive into the Colab filesystem so files stored on Drive
# can be read through ordinary paths under /content/drive.
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
# Read the train and test CSV files from the mounted Drive project folder.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/My Drive/MLProject/train.csv',
        '/content/drive/My Drive/MLProject/test.csv',
    )
)

In [None]:
# Examine data set: show the first five rows, then the full text of the
# essay stored under row label 2.
print(df_train.head(n=5))
print(df_train.loc[2, 'full_text'])

        text_id                                          full_text  cohesion  \
0  0016926B079C  I think that students would benefit from learn...       3.5   
1  0022683E9EA5  When a problem is a change you have to let it ...       2.5   
2  00299B378633  Dear, Principal\n\nIf u change the school poli...       3.0   
3  003885A45F42  The best time in life is when you become yours...       4.5   
4  0049B1DF5CCC  Small act of kindness can impact in other peop...       2.5   

   syntax  vocabulary  phraseology  grammar  conventions  
0     3.5         3.0          3.0      4.0          3.0  
1     2.5         3.0          2.0      2.0          2.5  
2     3.5         3.0          3.0      3.0          2.5  
3     4.5         4.5          4.5      4.0          5.0  
4     3.0         3.0          3.0      2.5          2.5  
Dear, Principal

If u change the school policy of having a grade b average that unfair. Because many students have a C average. So that means that they cant go out f

In [None]:
# Data pre-processing
# Replace every non-word character (punctuation, spaces, and the control
# characters \n, \r, \t) with a single space.
#
# Why a space and not '': substituting the empty string for line breaks glues
# together the last word of one line and the first word of the next whenever
# no punctuation separates them (e.g. "Principal\n\nIf" -> "PrincipalIf").
# [^\w] already matches \n, \r and \t, so one pass covers both steps.
#
# The same cleaning is applied to train and test so that both frames are
# normalised identically before tokenisation.
non_word = re.compile(r'[^\w]')
df_train['full_text'] = df_train['full_text'].replace(non_word, ' ', regex=True)
df_test['full_text'] = df_test['full_text'].replace(non_word, ' ', regex=True)
print(df_train['full_text'][5])

Dear Principal Our school should have a community center  The reasons why  are so students can learn what our community needs  how to make our community better place  and why is community important for students to know  Its a great to have a community center to know how we can make things better Students think community center takes their time away  but they have to learn what our community needs  students will participate in a group of students making a list what our community needs  therefore students will learn what our community needs  students will present their list of things our community needs  due to that students will be giving extra credit for the ones who have low grades Some students don t participate because their friends say its waste of time  it would not be waste of time when you get to know how our community can be a better place for us  students should know that the program is about our own lives  because if our community is bad well our lives are going to be bad  du

In [None]:
# Fetch the 'punkt' tokenizer resource that nltk's word_tokenize relies on.
nltk.download(info_or_id='punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Split every essay in both frames into a list of word tokens.
train_token, test_token = (
    frame['full_text'].apply(word_tokenize) for frame in (df_train, df_test)
)

# Show the tokens of the essay stored under row label 0.
print(train_token.loc[0])

['I', 'think', 'that', 'students', 'would', 'benefit', 'from', 'learning', 'at', 'home', 'because', 'they', 'wont', 'have', 'to', 'change', 'and', 'get', 'up', 'early', 'in', 'the', 'morning', 'to', 'shower', 'and', 'do', 'there', 'hair', 'taking', 'only', 'classes', 'helps', 'them', 'because', 'at', 'there', 'house', 'they', 'll', 'be', 'pay', 'more', 'attention', 'they', 'will', 'be', 'comfortable', 'at', 'home', 'The', 'hardest', 'part', 'of', 'school', 'is', 'getting', 'ready', 'you', 'wake', 'up', 'go', 'brush', 'your', 'teeth', 'and', 'go', 'to', 'your', 'closet', 'and', 'look', 'at', 'your', 'cloths', 'after', 'you', 'think', 'you', 'picked', 'a', 'outfit', 'u', 'go', 'look', 'in', 'the', 'mirror', 'and', 'youll', 'either', 'not', 'like', 'it', 'or', 'you', 'look', 'and', 'see', 'a', 'stain', 'Then', 'you', 'll', 'have', 'to', 'change', 'with', 'the', 'online', 'classes', 'you', 'can', 'wear', 'anything', 'and', 'stay', 'home', 'and', 'you', 'wont', 'need', 'to', 'stress', 'abou

In [None]:
# assign x and y
# X: the essay text; y: the six score columns to predict.
X = df_train['full_text']
y = df_train[['cohesion','syntax','vocabulary','phraseology','grammar','conventions']]

# split train / validation / test  ->  60% / 20% / 20%
# Step 1 holds out 20% of all rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
# Step 2 takes 25% of the remaining 80% (= 20% of all rows) as validation.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1)

# Use positional access: the split keeps the original row labels, so
# X_train[0] is a lookup of *label* 0 and raises KeyError whenever that row
# was not assigned to the training split.
print(X_train.iloc[0])
print(y.shape)

I think that students would benefit from learning at home because they wont have to change and get up early in the morning to shower and do there hair  taking only classes helps them because at there house they ll be pay more attention  they will be comfortable at home The hardest part of school is getting ready  you wake up go brush your teeth and go to your closet and look at your cloths  after you think you picked a outfit u go look in the mirror and youll either not like it or you look and see a stain  Then you ll have to change  with the online classes you can wear anything and stay home and you wont need to stress about what to wear most students usually take showers before school  they either take it before they sleep or when they wake up  some students do both to smell good  that causes them do miss the bus and effects on there lesson time cause they come late to school  when u have online classes u wont need to miss lessons cause you can get everything set up and go take a sho

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow import keras
from tensorflow.keras import layers

In [None]:
# Length every integer sequence is padded / truncated to.
# NOTE(review): the original comment says the maximum essay length in words
# is 1250 -- confirm against the data.
MAX_LEN = 1250

# Build the vocabulary from the training split only; words that appear only
# in validation/test are mapped to the "<OOV>" token.
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index

# Convert each split to integer sequences and bring them to a common length.
# Sequences longer than MAX_LEN are cut at the end (truncating='post').
train_seq = tokenizer.texts_to_sequences(X_train)
pad_train = pad_sequences(train_seq, maxlen=MAX_LEN, truncating='post')

# Store the validation matrix under its own name; assigning it to pad_train
# would overwrite the padded training data.
val_seq = tokenizer.texts_to_sequences(X_val)
pad_val = pad_sequences(val_seq, maxlen=MAX_LEN, truncating='post')

test_seq = tokenizer.texts_to_sequences(X_test)
pad_test = pad_sequences(test_seq, maxlen=MAX_LEN, truncating='post')

print(pad_train[0])

[   0    0    0 ... 2683   11  650]
