In [70]:
# Importing modules
import re

import numpy as np
import pandas as pd
from pandas.api.types import is_string_dtype

# Importing spell checking and language detection packages
from enchant.checker import SpellChecker
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException

# Importing necessary nltk packages
from nltk.tokenize import word_tokenize

# Cleaning the Data

This section covers removing duplicate rows from the data and correcting spelling.
Rows whose reflection column contains nonsense content are identified by looking for rows with an unusually large number of words, and are removed. Rows with non-English content are also detected and removed.


In [50]:
#load the raw student reflections; the last line displays the (rows, columns) shape
data = pd.read_csv('StudentReflectionStrategy.csv')
data.shape

(184835, 4)

In [51]:
#keep only the first row for each (content, top_1_score, top_2_score) combination
data = data.drop_duplicates(
    subset=["content", "top_1_score", "top_2_score"],
    keep="first",
)

In [52]:
data.count()

UseCase        104350
content        104350
top_1_score    104350
top_2_score    104350
dtype: int64

In [8]:
#data.to_csv("data_no_duplicate")

In [5]:
#from enchant.checker import SpellChecker

In [53]:
def spell_check(text):
    '''
    spell_check: function for correcting the spelling of a single reflection
    Expects:  a string (one reflection); any non-string value (e.g. the NaN of
              a missing cell) is treated as an empty reflection
    Returns:  a string -- the reflection with every word flagged by the en_US
              dictionary replaced by its first suggestion; the whitespace
              separated tokens are re-joined with single spaces
    '''
    # Missing cells reach .apply() as float NaN, which has no .split()
    if not isinstance(text, str):
        return ''
    tokens = text.split()
    if not tokens:
        return ''
    # One checker is reused for every token instead of building a new one per word
    chkr = SpellChecker("en_US")
    corrected = []
    for token in tokens:
        chkr.set_text(token)
        for err in chkr:
            # Ask for the suggestions once; a word without any suggestion is left as it is
            suggestions = err.suggest()
            if suggestions:
                err.replace(suggestions[0])
        corrected.append(chkr.get_text())
    return ' '.join(corrected)

In [54]:
#Correcting the misspelled
data['Corrected_content'] = data.content.apply(spell_check)


In [None]:
#data.to_csv("data_no_duplicate_spell_checked")

In [77]:
#data.head()

In [78]:
#the spell-corrected reflections are pulled out of the dataframe as a plain Python list
ref_list = list(data['Corrected_content'])


In [113]:
def word_count(text):
    '''
    word_count: function for counting the words in each reflection
    Expects:  an iterable of reflections (strings)
    Returns:  a list with, for each reflection in input order, the number of
              tokens word_tokenize produces for it
    '''
    return [len(word_tokenize(reflection)) for reflection in text]

def lang_detect(text):
    '''
    lang_detect: function for detecting the language of each reflection
    Expects: an iterable of reflections (strings)
    Returns: a list with one entry per reflection, in input order: the language
             code returned by langdetect's detect(), or 'unknown' when no
             language can be determined (non-string, empty or whitespace-only
             text, or text in which langdetect finds no usable features)
    '''
    # NOTE(review): langdetect documents its results as non-deterministic unless
    # DetectorFactory.seed is set -- consider seeding once near the imports.
    lang = []
    for refl in text:
        # detect() cannot work on missing or blank text, so the call is skipped
        if not isinstance(refl, str) or not refl.strip():
            lang.append('unknown')
            continue
        try:
            lang.append(detect(refl))
        except LangDetectException:
            # raised e.g. for text made only of digits or punctuation; one such
            # row must not abort the detection for the whole column
            lang.append('unknown')
    return lang


In [114]:
#new columns with the word count and the detected language of each reflection are added to the dataframe
data['word_counts']  = word_count(ref_list)
data['lang']  = lang_detect(ref_list)

In [134]:
#data.head()

In [116]:
#reflections are sorted based on their number of words
data_new= data.sort_values('word_counts', ascending=False)
#data_new.head()


Unnamed: 0,UseCase,content,top_1_score,top_2_score,Corrected_content,word_counts,lang
122266,0,Bee Movie Script - Dialogue Transcript Voila! ...,4,3,Bee Movie Script - Dialogged Transcript Viola!...,12747,en
147563,1,second semester i will work on paying more att...,3,4,second semester i will work on paying more att...,2785,so
158969,1,"i finally did something to learn. type A,typ...",1,3,"i finally did something to learn. type A,type ...",1294,en
25267,0,This article is about the cat species that is ...,3,4,This article is about the cat species that is ...,856,en
161852,1,"Ahhh! Such a beautiful day, uh (yea) The sun’s...",4,3,"Ahoy! Such a beautiful day, uh (yea) The sun’s...",787,en


I read the 25 reflections with the highest word counts and noticed that most of them contain irrelevant content, including a movie script.

In [117]:
#index labels of the rows that were identified as irrelevant content:
list_to_removed = [122266, 147563, 158969, 25267,161852, 43573, 30258, 95833, 37911, 25318, 124847, 97110, 25298, 93031, 39079, 
                   106633, 92032, 50680, 128034]

In [118]:
#rows with irrelevant content are removed
#(reassigned instead of dropped in place, and labels that are already gone are ignored,
# so re-running this cell does not raise a KeyError)
data_new = data_new.drop(index=list_to_removed, errors='ignore')
data_new.head()

Unnamed: 0,UseCase,content,top_1_score,top_2_score,Corrected_content,word_counts,lang
161835,1,I will take notes and not talk. I will pay att...,2,3,I will take notes and not talk. I will pay att...,703,en
153306,1,I understand sense we do these notes it hel...,4,3,I understand sense we do these notes it helps ...,634,en
166028,1,"Over the course of the six weeks, I was consta...",4,3,"Over the course of the six weeks, I was consta...",566,en
145501,1,I feel like I made my best improvements in exp...,3,4,I feel like I made my best improvements in exp...,407,en
28786,0,*The knowledge checks and quizzes prepared me...,4,3,*The knowledge checks and quizzes prepared me ...,403,en


In [120]:
data_new.shape

(104331, 7)

In [122]:
#rows whose detected language is Spanish ('es') are removed; rows tagged with any other language code are kept
data_newer = data_new.drop(data_new[(data_new.lang =='es')].index)
data_newer.shape

(104221, 7)

In [135]:
#data_newer.head()

In [131]:
#reset_index returns a new dataframe, so the result is assigned back; otherwise data_newer keeps its old index labels
data_newer = data_newer.reset_index(drop=True)
data_newer

Unnamed: 0,UseCase,content,top_1_score,top_2_score,Corrected_content,word_counts,lang
0,1,I will take notes and not talk. I will pay att...,2,3,I will take notes and not talk. I will pay att...,703,en
1,1,I understand sense we do these notes it hel...,4,3,I understand sense we do these notes it helps ...,634,en
2,1,"Over the course of the six weeks, I was consta...",4,3,"Over the course of the six weeks, I was consta...",566,en
3,1,I feel like I made my best improvements in exp...,3,4,I feel like I made my best improvements in exp...,407,en
4,0,*The knowledge checks and quizzes prepared me...,4,3,*The knowledge checks and quizzes prepared me ...,403,en
...,...,...,...,...,...,...,...
104216,0,re-read the question,2,3,re-read the question,3,en
104217,0,the questions,2,3,the questions,2,fr
104218,0,the questions,2,3,the questions,2,fr
104219,1,asked questions,2,3,asked questions,2,fr


In [136]:
#new dataframe from cleaned data saved as 'final_cleaned_data'
#(no index argument is passed, so the index is written too, as an unnamed first column)
data_newer.to_csv('final_cleaned_data')

# Extracting the reflections with scores 3 or 4
These reflections are expected to have richer content in terms of learning-strategy topics.

In [150]:
cleaned_data = pd.read_csv('final_cleaned_data')
#cleaned_data.tail()

In [138]:
cleaned_data['average_score'] = (cleaned_data['top_1_score'] + cleaned_data['top_2_score'])/2

In [140]:
#boolean mask of the rows whose two scores average exactly 3.5
#NOTE(review): presumably the scores are integers, so this selects the 3/4 pairs -- confirm
is_highest = cleaned_data['average_score']==3.5 

In [141]:
data_highest=cleaned_data[is_highest]

In [148]:
#data_highest.count()

In [147]:
#data_highest.info()

In [146]:
data_highest_new = data_highest.drop(['Unnamed: 0', 'UseCase'], axis = 1)
#data_highest_new.head()

In [145]:
data_highest_new.to_csv('highest_cleaned')

 A dataframe of the cleaned and spell-corrected reflections with an average score of 3.5 is saved as 'highest_cleaned',
 which will later be used for topic modeling.