In [1]:
# import packages
import pandas as pd
import os
import glob
import random
import numpy as np
import statistics
import warnings
warnings.filterwarnings('ignore')

from scipy.stats import ttest_rel
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import rcParams
import matplotlib.patches as mpatches

import statsmodels.api as sm
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LassoCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
import statsmodels.formula.api as smf
from scipy import stats
from scipy.stats import norm

## Data Prep

In [4]:
# Folder that holds the raw export and receives every saved figure.
# Set ROCKDALE_DATA_DIR to run the notebook on another machine; the original
# location is kept as the fallback so existing runs behave the same.
path = os.environ.get('ROCKDALE_DATA_DIR', r'/Users/spencersu/Desktop/Rockdale')

# create a dataframe for word count (data retrieved on Jan 20, 2022)
wordcount = pd.read_csv(os.path.join(path, 'Rockdale word count 19-22.csv'))
# keep columns 1..16, i.e. remove the first column "Unnamed: 0"
wordcount = wordcount.iloc[:, 1:17]

# The cells below refer to this frame as `wordcnt`; bind that name here so the
# notebook runs top to bottom on a fresh kernel instead of raising NameError.
wordcnt = wordcount

# wordcnt dataset now has 239253 rows × 16 columns

In [5]:
# Preview the loaded frame; in the output below each assignment appears once per rubric category.
wordcount

Unnamed: 0,Submission Date Date,School Year,School Name,Class Name,Class Subject,Teacher Name [Sensitive],Student ID,Full Name,Enrolled Grade,Lesson ID,Writing Lesson Category,Student Assignment ID,Word Count,Writing Score,Rubric Category,Rubric Category Score
0,2022-01-17,2021-2022,G L Edwards Middle School,"Excel Language Arts 7-23.0120017-711-Larkin, V...",Unknown,Velma Larkin,1151684,Phoenix Johnson,7,26533,Narrative,16211871,24,40,Audience Appropriate Language (Style & Word Ch...,2.0
1,2022-01-17,2021-2022,G L Edwards Middle School,"Excel Language Arts 7-23.0120017-711-Larkin, V...",Unknown,Velma Larkin,1151684,Phoenix Johnson,7,26533,Narrative,16211871,24,40,Conventions Of English,2.0
2,2022-01-17,2021-2022,G L Edwards Middle School,"Excel Language Arts 7-23.0120017-711-Larkin, V...",Unknown,Velma Larkin,1151684,Phoenix Johnson,7,26533,Narrative,16211871,24,40,"Establishing Setting, Point Of View And Charac...",2.0
3,2022-01-17,2021-2022,G L Edwards Middle School,"Excel Language Arts 7-23.0120017-711-Larkin, V...",Unknown,Velma Larkin,1151684,Phoenix Johnson,7,26533,Narrative,16211871,24,40,Narrative Techniques,2.0
4,2022-01-17,2021-2022,G L Edwards Middle School,"Excel Language Arts 7-23.0120017-711-Larkin, V...",Unknown,Velma Larkin,1151684,Phoenix Johnson,7,26533,Narrative,16211871,24,40,Organization,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
239248,2019-08-16,2019-2020,Conyers Middle School,"Excel Language Arts 7 - 23.0120017 - Woody, Ke...",Unknown,,721752,Sinai Lugo,8,23345,Narrative,12022015,56,32,Conclusion,1.0
239249,2019-08-16,2019-2020,Conyers Middle School,"Excel Language Arts 7 - 23.0120017 - Woody, Ke...",Unknown,,721752,Sinai Lugo,8,23345,Narrative,12022015,56,32,Conventions of English,1.0
239250,2019-08-16,2019-2020,Conyers Middle School,"Excel Language Arts 7 - 23.0120017 - Woody, Ke...",Unknown,,721752,Sinai Lugo,8,23345,Narrative,12022015,56,32,"Establishing Setting, Voices, and Characters",3.0
239251,2019-08-16,2019-2020,Conyers Middle School,"Excel Language Arts 7 - 23.0120017 - Woody, Ke...",Unknown,,721752,Sinai Lugo,8,23345,Narrative,12022015,56,32,Narrative Techniques,1.0


In [None]:
## change data type into integer: 'Word Count' column

# Drop rows without a word count. `.copy()` gives wordcnt2 its own data, so the
# column assignment below is an explicit write to this frame rather than a
# chained in-place edit that pandas may apply to a temporary.
wordcnt2 = wordcnt.dropna(subset=['Word Count']).copy()

# now 'wordcnt2' has 229509 rows × 16 columns

# Strip thousands separators (e.g. "1,024") and convert to integer.
# Going through str + to_numeric works whether the column was read as text
# or as a numeric dtype.
wordcnt2['Word Count'] = pd.to_numeric(
    wordcnt2['Word Count'].astype(str).str.replace(',', '', regex=False)
).astype(int)

In [None]:
# Display the cleaned frame (rows with a word count, 'Word Count' as integer).
wordcnt2

In [None]:
# wordcnt2.groupby(['Enrolled Grade','School Year'])['Word Count'].nunique()

In [None]:
# wordcnt_new

In [None]:
# Mean word count per grade and school year, with one observation per assignment.
# The raw rows repeat every assignment once per rubric category, so collapse to
# assignment level first. This is computed from wordcnt2 directly because
# wordcnt_new is only created further down the notebook and is not available
# here on a fresh top-to-bottom run.
wcoverview = (
    wordcnt2.groupby(['School Year', 'School Name', 'Full Name',
                      'Enrolled Grade', 'Lesson ID', 'Student Assignment ID'])['Word Count']
    .mean()
    .reset_index()
    .groupby(['Enrolled Grade', 'School Year'])['Word Count']
    .mean()
)
wcoverview
# wcoverview.to_csv(os.path.join(path,'word count overview by grades.csv'))

In [None]:
# Collapse the rubric-level rows to one row per assignment: grouping on the
# student assignment ID removes the duplicates created by the rubric category
# scores, and the mean of the repeated word count is the word count itself.
avgcnt = (
    wordcnt2
    .groupby(['School Year', 'School Name', 'Full Name',
              'Enrolled Grade', 'Student Assignment ID'])['Word Count']
    .mean()
    .to_frame()      # Series -> single-column DataFrame
    .reset_index()   # group keys back to ordinary columns
)

In [None]:
# Display the per-assignment frame built in the previous cell.
# 15044 rows × 5 columns
avgcnt

In [None]:
# count unique assignments and unique student names per school year
# TODO (author note): arrange, no need to groupby twice
# Several columns must be selected with a list; the bare tuple form
# gb['a', 'b'] is no longer accepted by pandas 2.x.
avgcnt.groupby('School Year')[['Student Assignment ID', 'Full Name']].nunique()

In [None]:
# Unique assignments and student names per school year and grade.
# Columns are selected with a list; the tuple form gb['a', 'b'] is no longer
# accepted by pandas 2.x.
avgcnt.groupby(['School Year', 'Enrolled Grade'])[['Student Assignment ID', 'Full Name']].nunique()

In [None]:
# Average the per-assignment word counts so each (year, school, student, grade)
# has one row; the assignment ID is no longer a key.
# NOTE(review): students are keyed by 'Full Name', so two students sharing a
# name in the same school/year/grade would be merged - confirm this is acceptable.
avgcnt = (
    avgcnt
    .groupby(['School Year', 'School Name', 'Full Name', 'Enrolled Grade'])['Word Count']
    .mean()
    .to_frame()
    .reset_index()
)
avgcnt

In [None]:
## Remove outliers
# Absolute z-score of every student's average word count.
z = np.abs(stats.zscore(avgcnt['Word Count']))
print(z)
# threshold = 3: positions of the rows that will be treated as outliers
print(np.where(z > 3))
# Work on a copy. A plain `avgcnt_z = avgcnt` only creates a second name for the
# same object, so adding the 'z' column would also change avgcnt.
avgcnt_z = avgcnt.copy()
avgcnt_z['z'] = z

# TODO (author note): remove outliers in wordcnt2

In [None]:
# Display the averages together with their absolute z-score column 'z'.
avgcnt_z

In [None]:
# Keep only the rows whose absolute z-score is below the threshold of 3.
avgcnt_z = avgcnt_z.loc[avgcnt_z['z'] < 3]

# TODO (author question): do the later lesson/assignment frames get this step too?

In [None]:
# Grade 3 rows of the outlier-filtered averages. The grade column is constant
# within this subset, so it is dropped.
avgcnt3 = avgcnt_z.loc[avgcnt_z['Enrolled Grade'] == 3].drop(columns='Enrolled Grade')

# 978 students from grade 3

# TODO (author note): remove outlier

In [None]:
# Display the grade 3 subset.
avgcnt3

In [None]:
# Grade 3: distribution of average word count for each school year.
plt.figure(figsize=(10, 8))
sns.boxplot(data=avgcnt3, x='School Year', y='Word Count')
plt.title('Grade 3')
# Write the figure into the data folder before showing it.
plt.savefig(os.path.join(path, 'boxplot grade3.png'))
plt.show()

In [None]:
## Grade 4
# Grade 4 rows of the outlier-filtered averages, without the constant grade column.
avgcnt4 = avgcnt_z.loc[avgcnt_z['Enrolled Grade'] == 4].drop(columns='Enrolled Grade')

# 1885 students from grade 4

# Distribution of average word count for each school year.
plt.figure(figsize=(10, 8))
sns.boxplot(data=avgcnt4, x='School Year', y='Word Count')
plt.title('Grade 4')
plt.savefig(os.path.join(path, 'boxplot grade4.png'))
plt.show()

In [None]:
# for grade in range(3,9):
    

In [None]:
# finish for the other grades 

In [None]:
## Grade 5
# Grade 5 rows of the outlier-filtered averages, without the constant grade column.
avgcnt5 = avgcnt_z.loc[avgcnt_z['Enrolled Grade'] == 5].drop(columns='Enrolled Grade')

# TODO: record the number of grade 5 students

# Distribution of average word count for each school year.
plt.figure(figsize=(10, 8))
sns.boxplot(data=avgcnt5, x='School Year', y='Word Count')
plt.title('Grade 5')
plt.savefig(os.path.join(path, 'boxplot grade5.png'))
plt.show()

In [None]:
## Grade 6
# Grade 6 rows of the outlier-filtered averages, without the constant grade column.
avgcnt6 = avgcnt_z.loc[avgcnt_z['Enrolled Grade'] == 6].drop(columns='Enrolled Grade')

# TODO: record the number of grade 6 students

# Distribution of average word count for each school year.
plt.figure(figsize=(10, 8))
sns.boxplot(data=avgcnt6, x='School Year', y='Word Count')
plt.title('Grade 6')
plt.savefig(os.path.join(path, 'boxplot grade6.png'))
plt.show()

In [None]:
## Grade 7
# Grade 7 rows of the outlier-filtered averages, without the constant grade column.
avgcnt7 = avgcnt_z.loc[avgcnt_z['Enrolled Grade'] == 7].drop(columns='Enrolled Grade')

# TODO: record the number of grade 7 students

# Distribution of average word count for each school year.
plt.figure(figsize=(10, 8))
sns.boxplot(data=avgcnt7, x='School Year', y='Word Count')
plt.title('Grade 7')
plt.savefig(os.path.join(path, 'boxplot grade7.png'))
plt.show()

In [None]:
## Grade 8
# Grade 8 rows of the outlier-filtered averages, without the constant grade column.
avgcnt8 = avgcnt_z.loc[avgcnt_z['Enrolled Grade'] == 8].drop(columns='Enrolled Grade')

# TODO: record the number of grade 8 students

# Distribution of average word count for each school year.
plt.figure(figsize=(10, 8))
sns.boxplot(data=avgcnt8, x='School Year', y='Word Count')
plt.title('Grade 8')
plt.savefig(os.path.join(path, 'boxplot grade8.png'))
plt.show()

In [None]:
# overall
# Drop the grade column by reassignment rather than in place. errors='ignore'
# keeps the cell re-runnable once the column is already gone (an in-place drop
# raises KeyError on the second run).
avgcnt_z = avgcnt_z.drop(columns='Enrolled Grade', errors='ignore')

# boxplot: word count vs. school year, all grades together
plt.figure(figsize = (10,8))
sns.boxplot(x=avgcnt_z['School Year'],y=avgcnt_z['Word Count'])
plt.title('Overall')
# plt.savefig(os.path.join(path,'boxplot grade8.png'))
plt.show()

In [None]:
# avgcnt_z.to_csv(os.path.join(path, 'Avg Word Count (Outlier Removed).csv'))

In [None]:
# z = np.abs(stats.zscore(wordcnt_new['Word Count']))
# print(z)
# # threshold = 3
# print(np.where(z > 3))
# wordcnt_z = wordcnt_new
# wordcnt_z['z'] = z
# wordcnt_z = wordcnt_z[wordcnt_z['z']<3]

In [None]:
# wordcnt_z.to_csv(os.path.join(path, 'Avg Word Count (Outlier Removed).csv'))

In [None]:
# wordcnt_z.groupby('School Year').count()

## arrange

In [None]:
# Average over grades so each (year, school, student) has a single value;
# the enrolled grade is no longer part of the key.
avgcnt_new = (
    avgcnt
    .groupby(['School Year', 'School Name', 'Full Name'])['Word Count']
    .mean()
    .to_frame()
    .reset_index()
)

# Reshape to wide form: one row per (school, student), one column per school year.
pvt_avgcnt = avgcnt_new.pivot(
    index=['School Name', 'Full Name'],
    columns='School Year',
    values='Word Count',
)

In [None]:
# Keep only students that have a value in every school-year column
# (any NaN in a row removes that row).
pvt_avgcnt = pvt_avgcnt.dropna(axis=0, how='any')

In [None]:
# Display the pivot with school and student as ordinary columns (result is not stored).
pvt_avgcnt.reset_index()

In [None]:
# Remove rows whose average word count exceeds 1600.
# NOTE(review): pvt_avgcnt was built from avgcnt_new before this filter, so the
# paired t-tests below still include these rows - confirm that is intended.
avgcnt_new.drop(index=avgcnt_new.index[avgcnt_new['Word Count'] > 1600], inplace=True)

# Word count vs. school year after the threshold filter.
plt.figure(figsize=(10, 8))
sns.boxplot(data=avgcnt_new, x='School Year', y='Word Count')
plt.show()

## arrange 

In [None]:
# Paired t-test on students that have an average in both 2019-2020 and 2020-2021
# (pvt_avgcnt keeps only rows without NaN).
# TODO (author note): add print!
ttest_rel(pvt_avgcnt['2019-2020'], pvt_avgcnt['2020-2021'])

# Author's reading of the result: the t value is negative, i.e. the 2020-2021
# averages are higher than the 2019-2020 ones.

In [None]:
# Paired t-test: 2020-2021 vs. 2021-2022 averages of the same students.
ttest_rel(pvt_avgcnt['2020-2021'], pvt_avgcnt['2021-2022'])

In [None]:
# Paired t-test: 2019-2020 vs. 2021-2022 averages of the same students.
ttest_rel(pvt_avgcnt['2019-2020'], pvt_avgcnt['2021-2022'])

## plots

In [None]:
# ## arrange to the top 
# # drop columns: writing scores, lesson category, class subject, class name, submission date, student ID 
# wordcnt_new = wordcnt.drop(['Submission Date Date','Class Name','Class Subject',
#               'Teacher Name [Sensitive]','Student ID','Writing Lesson Category',
#               'Writing Score','Rubric Category','Rubric Category Score'],
#              axis=1, inplace=True)

In [None]:
# One row per assignment, keeping year, school, student, grade and lesson as keys.
# The mean collapses the rows repeated per rubric category.
# TODO (author note): make this a function!
wordcnt_new = (
    wordcnt2
    .groupby(['School Year', 'School Name', 'Full Name',
              'Enrolled Grade', 'Lesson ID', 'Student Assignment ID'])['Word Count']
    .mean()
    .to_frame()
    .reset_index()
)
wordcnt_new

In [None]:
# Mean per-assignment word count for each grade and school year.
wordcnt_new.groupby(['Enrolled Grade','School Year'])['Word Count'].mean()

In [None]:
# arrange

# create three per-student, per-year aggregates; they are merged in the next cell
# a: number of distinct lessons. The raw frame repeats every assignment once per
#    rubric category, so count() would tally rubric rows rather than lessons;
#    nunique() counts each Lesson ID once.
a = wordcnt.groupby(['School Year','School Name','Full Name'])['Lesson ID'].nunique()
# b: mean word count over the student's assignments
b = wordcnt_new.groupby(['School Year','School Name','Full Name'])['Word Count'].mean()
# c: number of assignments (wordcnt_new has one row per assignment)
c = wordcnt_new.groupby(['School Year','School Name','Full Name'])['Student Assignment ID'].count()

In [None]:
# Inner-join the three aggregates on their shared keys, one after the other.
_merge_keys = ['School Year', 'School Name', 'Full Name']
num_asgmt_lsn = a
for _piece in (b, c):
    num_asgmt_lsn = pd.merge(num_asgmt_lsn, _piece, on=_merge_keys, how='inner')

In [None]:
# Turn the merge keys (index levels) back into ordinary columns.
num_asgmt_lsn = num_asgmt_lsn.reset_index()

In [None]:
# test if the number of lessons is the same as the number of assignments
# (Series.equals is True only when every row matches)
num_asgmt_lsn['Lesson ID'].equals(num_asgmt_lsn['Student Assignment ID'])

In [None]:
# Give the student and the two count columns descriptive names. Columns are
# addressed by position (2, 3 and 5) in the frame produced by the merge above.
num_asgmt_lsn.rename(
    columns=dict(zip(
        num_asgmt_lsn.columns[[2, 3, 5]],
        ["Student Name", 'Num of Lessons', 'Num of Assignments Graded'],
    )),
    inplace=True,
)

In [None]:
# Display the merged per-student frame with its renamed columns.
num_asgmt_lsn

In [None]:
## Remove outliers
# Absolute z-score of each student's mean word count.
z1 = np.abs(stats.zscore(num_asgmt_lsn['Word Count']))
print(z1)
# threshold = 3: positions of rows more than three standard deviations from the mean
print(np.where(z1 > 3))
# Store the score as a column; the rows themselves are dropped in a later cell.
num_asgmt_lsn['z'] = z1

In [None]:
# Display the frame with the added 'z' column.
num_asgmt_lsn

In [None]:
# Drop, in place, every row whose absolute z-score is above 3.
num_asgmt_lsn.drop(index=num_asgmt_lsn.index[num_asgmt_lsn['z'] > 3], inplace=True)

In [None]:
# Display the frame after the outlier rows were dropped.
num_asgmt_lsn

In [None]:
# scatterplot with a fitted regression line: number of lessons vs. word count
# (author note, kept: remove outlier!)

plt.figure(figsize = (10,8))
# x and y are passed by keyword: seaborn 0.12+ no longer accepts them as
# positional arguments.
sns.regplot(x=num_asgmt_lsn['Num of Lessons'], y=num_asgmt_lsn['Word Count'],
            scatter_kws={"color": "black",'s':12}, truncate = False,
            line_kws={"color": "red"}, ci=None)
plt.xlabel('Number of Lessons')
plt.ylabel('Word Count')
plt.title("Number of Lessons vs. Word Count", fontsize=15)
plt.savefig(os.path.join(path,'Number of Lessons vs. Word Count.png'))
plt.show()

## add a correlation for word count vs. scores 

In [None]:
# Scatter plot with a first-order regression line: number of graded assignments
# vs. word count. x_jitter spreads the integer counts slightly for display only.
sns.set_style(style='white')
plt.figure(figsize = (10,8))
# x and y are passed by keyword: seaborn 0.12+ no longer accepts them as
# positional arguments.
sns.regplot(x=num_asgmt_lsn['Num of Assignments Graded'], y=num_asgmt_lsn['Word Count'],
            scatter_kws={"color": "#2f7fe2",'s':8}, truncate = False, order=1,x_jitter=.1,marker='o',
            line_kws={"color": "#ff6800"}, ci=None)
plt.xlabel('Number of Assignments')
plt.ylabel('Word Count')
plt.title("Number of Assignments vs. Word Count", fontsize=15)
plt.savefig(os.path.join(path,'Number of Assignments vs. Word Count.png'))
plt.show()

In [None]:
# Same relationship with a second-order (quadratic) fit; the trailing ';' hides the returned object's repr.
sns.lmplot(x="Num of Assignments Graded", y="Word Count", data=num_asgmt_lsn, 
           order=2, ci=None, scatter_kws={"s": 80});

In [None]:
# Number of graded assignments vs. word count, one regression line per school year.
plt.style.use('bmh')
# lmplot is a figure-level function that creates its own figure, so a preceding
# plt.figure(figsize=...) only leaves an empty figure behind. The size is set
# through height/aspect instead: 8 in tall, 8 * 1.25 = 10 in wide plot area.
sns.lmplot(x='Num of Assignments Graded',y='Word Count', data=num_asgmt_lsn,
           truncate = False, hue='School Year',ci=None, height=8, aspect=1.25)
plt.xlabel('Number of Assignments Graded')
plt.ylabel('Word Count')
plt.title("Number of Assignments Graded vs. Word Count", fontsize=15)
plt.savefig(os.path.join(path,'Number of Assignments Graded vs. Word Count by Year.png'))
plt.show()

In [None]:
# Number of lessons vs. word count, one regression line per school year.
# lmplot is a figure-level function that creates its own figure, so a preceding
# plt.figure(figsize=...) only leaves an empty figure behind. The size is set
# through height/aspect instead: 8 in tall, 8 * 1.25 = 10 in wide plot area.
sns.lmplot(x='Num of Lessons',y='Word Count', data=num_asgmt_lsn,
           truncate = False, hue='School Year',ci=None, height=8, aspect=1.25)
plt.xlabel('Number of Lessons')
plt.ylabel('Word Count')
plt.title("Number of Lessons vs. Word Count", fontsize=15)
plt.show()

## Regression

In [None]:
# Same three aggregates as for the plots, with enrolled grade added to the keys
# so it can enter the regression.
# a2: number of distinct lessons. The raw frame repeats every assignment once per
#     rubric category, so count() would tally rubric rows rather than lessons;
#     nunique() counts each Lesson ID once.
a2 = wordcnt.groupby(['School Year','School Name','Full Name','Enrolled Grade'])['Lesson ID'].nunique()
# b2: mean word count over the student's assignments
b2 = wordcnt_new.groupby(['School Year','School Name','Full Name','Enrolled Grade'])['Word Count'].mean()
# c2: number of assignments (wordcnt_new has one row per assignment)
c2 = wordcnt_new.groupby(['School Year','School Name','Full Name','Enrolled Grade'])['Student Assignment ID'].count()

In [None]:
# Outer-join the three aggregates on their shared keys, one after the other;
# keys missing from one side are kept and filled with NaN.
_reg_keys = ['School Year', 'School Name', 'Full Name', 'Enrolled Grade']
df_reg = a2
for _piece in (b2, c2):
    df_reg = pd.merge(df_reg, _piece, on=_reg_keys, how='outer')

In [None]:
# Turn the merge keys (index levels) back into ordinary columns.
df_reg = df_reg.reset_index()

In [None]:
# Rename the first seven columns, by position, to snake_case names that can be
# used directly inside a model formula.
df_reg.rename(
    columns=dict(zip(
        df_reg.columns[:7],
        ['school_year', 'school_name', 'student_name', 'enrolled_grade',
         'num_lessons', 'word_count', 'num_asgmts'],
    )),
    inplace=True,
)

In [None]:
# Drop, in place, every row whose mean word count is above 1000.
df_reg.drop(index=df_reg.index[df_reg['word_count'] > 1000], inplace=True)

In [None]:
# Display the regression frame after the word-count threshold.
df_reg

In [None]:
# OLS of mean word count on the number of assignments, with enrolled grade,
# school year and school entered as categorical terms via C(...).
results = smf.ols('word_count ~ num_asgmts + C(enrolled_grade) + C(school_year) + C(school_name)', data=df_reg).fit()
print(results.summary())

# Author's reading of the summary: finishing one more assignment is associated
# with about 19 more words.

# TODO (author note): remove outliers from the beginning

In [None]:
# Same specification with the number of lessons as the predictor of interest.
results2 = smf.ols('word_count ~ num_lessons + C(enrolled_grade) + C(school_year) + C(school_name)', data=df_reg).fit()
print(results2.summary())

In [None]:
# lasso for all variables (class, grade, year, school, etc.), see which has the most effect on word count 

In [None]:
# time series analysis: change the submission date into all first date of the month, then categorize three month
# into one category, then conduct time series analysis to see if the there is any trend for the total data 

In [None]:
# One-hot encode grade, school and school year.
# dtype=float keeps the design matrix numeric: pandas 2.x returns boolean dummy
# columns by default, and mixing those with the numeric columns makes
# sm.OLS see an object-dtype array and refuse it.
# NOTE: this overwrites df_reg, so the formula-based models above cannot be
# re-run afterwards without rebuilding df_reg.
df_reg = pd.get_dummies(df_reg, columns=['enrolled_grade','school_name','school_year'], dtype=float)

In [None]:
# Drop rows with any missing value (the outer merges above can leave NaN), then display.
df_reg = df_reg.dropna()
df_reg

In [None]:
# OLS on the one-hot encoded frame: word_count against every remaining column
# except the student name and num_lessons (i.e. num_asgmts plus all dummies).
# NOTE(review): no constant is added and each categorical keeps its full set of
# dummies, so the three dummy groups are linearly dependent on each other -
# individual dummy coefficients should be interpreted with care.
Y = df_reg['word_count']
X = df_reg[df_reg.columns.drop(['word_count','student_name','num_lessons'])]
model = sm.OLS(Y, X).fit()
predictions = model.predict(X) 

print_model = model.summary()
print(print_model)

# Author's reading of the summary:
## 19-20 shows negative correlation between word count and num of assignments 
## if 20-21, smallest number of word count 

## Random Forest Regression

In [None]:
# One row per assignment with the features used by the random forest.
# NOTE(review): groupby drops groups whose key is NaN by default, and the
# preview of the raw data shows empty teacher names for 2019-2020 rows -
# confirm those assignments are meant to be excluded here.
df_rf = (
    wordcnt2
    .groupby(['School Year', 'School Name', 'Lesson ID', 'Teacher Name [Sensitive]',
              'Enrolled Grade', 'Writing Lesson Category',
              'Student Assignment ID'])['Word Count']
    .mean()
    .to_frame()
    .reset_index()
)
df_rf

# 17689 rows × 8 columns

In [None]:
# One-hot encode the frame (called without `columns`, so pandas chooses which columns to encode).
df_rf = pd.get_dummies(df_rf)
df_rf

# 17689 rows × 225 columns

In [None]:
# Target: per-assignment word count; predictors: every other column of df_rf.
# NOTE(review): the predictors presumably still contain the numeric identifiers
# 'Lesson ID' and 'Student Assignment ID' - confirm they are wanted as features.
Y_rf = df_rf['Word Count']
X_rf = df_rf[df_rf.columns.drop(['Word Count'])]

# Seed the split as well as the model: without random_state the train/test
# partition changes on every run and the metrics below are not reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_rf, Y_rf, test_size=0.2, random_state=100)

model_rf = RandomForestRegressor(n_estimators=500, oob_score=True, random_state=100)
model_rf.fit(X_train, y_train)

# In-sample fit
pred_train_rf = model_rf.predict(X_train)
print('Training RMSE:', np.sqrt(mean_squared_error(y_train,pred_train_rf)))
print('Training R Squared:', r2_score(y_train, pred_train_rf))

# Held-out fit
pred_test_rf = model_rf.predict(X_test)
print('Testing RMSE:', np.sqrt(mean_squared_error(y_test,pred_test_rf)))
print('Testing R Squared:',r2_score(y_test, pred_test_rf))

# Author's reading: training set R Squared is good, but training set and
# testing set differ a lot.

## Barplots

In [None]:
# matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and later
# releases removed the old names; use the new name when it exists and fall back
# to the old one on older matplotlib.
plt.style.use('seaborn-v0_8-bright' if 'seaborn-v0_8-bright' in plt.style.available
              else 'seaborn-bright')

In [None]:
# Brand colours used for the three school years.
colors = ["#FF6800", "#2F7FE2", "#42b029"]
# sns.set_palette() returns None, so store the palette object itself in
# customPalette (later cells pass it as `palette=`) and register it as the
# default colour cycle in a separate call.
customPalette = sns.color_palette(colors)
sns.set_palette(customPalette)

In [None]:
# Grade 3: mean of the students' average word counts per school year, as a
# two-column frame (prepared for a line plot).
line3 = (
    avgcnt3
    .groupby('School Year')['Word Count']
    .mean()
    .to_frame()
    .reset_index()
)
line3

In [None]:
# Summary statistics of the grade 3 average word counts.
avgcnt3['Word Count'].describe()

In [None]:
# Bar chart of the grade 3 average word count per school year.
plt.figure(figsize = (10,6))
# Saved figures are written at 300 dpi from here on (rcParams is global).
plt.rcParams['savefig.dpi'] = 300
# Fix the y range before drawing the bars.
plt.ylim(0,110)
# ci=None: no error bars are drawn.
ax = sns.barplot(x=avgcnt3['School Year'],y=avgcnt3['Word Count'], ci=None, palette=customPalette)
plt.title('Grade 3 Avg. Word Count 19-22')

def change_width(ax, new_value):
    """Set every bar in ``ax`` to width ``new_value``, keeping each bar centred.

    Parameters
    ----------
    ax : object with a ``patches`` sequence (e.g. a matplotlib Axes)
        Patches that expose ``get_width``/``set_width``/``get_x``/``set_x``
        (rectangles, as drawn by a bar plot) are resized in place.
    new_value : float
        New bar width in data units.

    Patches without a width (for example the path patches of a box plot) are
    skipped instead of raising AttributeError.
    """
    for patch in ax.patches:
        if not (hasattr(patch, 'get_width') and hasattr(patch, 'set_width')):
            continue

        current_width = patch.get_width()
        diff = current_width - new_value

        # we change the bar width
        patch.set_width(new_value)

        # we recenter the bar: shift the left edge by half the width change
        patch.set_x(patch.get_x() + diff * .5)

# Narrow the bars of the chart drawn above to width 0.5, keeping them centred.
change_width(ax, .5)

# Write the bar chart into the data folder, then show it.
plt.savefig(os.path.join(path,'bar chart grade3.png'))

plt.show()

In [None]:
# Grade 3 box plot per school year with the outlier markers hidden.
plt.figure(figsize = (10,6))
plt.rcParams['savefig.dpi'] = 300
plt.ylim(0,300)
# width=.5 narrows the boxes directly. change_width() is written for bar
# rectangles and is not applicable to the patches a box plot draws.
ax = sns.boxplot(x=avgcnt3['School Year'],y=avgcnt3['Word Count'], showfliers = False,
                 width=.5, palette=customPalette)
# ax = sns.boxplot(x=avgcnt3['School Year'],kind='count',ci=None, palette=customPalette)
# sns.lineplot(data=line3, marker='o', palette=['grey'])

plt.title('Grade 3')

# Saved under its own name: the previous cell already wrote
# 'bar chart grade3.png', which this figure used to overwrite.
plt.savefig(os.path.join(path,'boxplot grade3 no fliers.png'))

plt.show()

In [None]:
# Display the grade 3 subset again.
avgcnt3

In [None]:
# `wordcnt_z` is only created in a commented-out cell, so this line raised
# NameError on a fresh run. Rebuild it here the way that cell did: the
# per-assignment rows of wordcnt_new whose absolute z-score is below 3.
wordcnt_z = wordcnt_new[np.abs(stats.zscore(wordcnt_new['Word Count'])) < 3]
# Summary statistics of the per-assignment word count by school year and grade.
wordcnt_z.groupby(['School Year', 'Enrolled Grade'])['Word Count'].describe()

## Regression bar plots

In [None]:
# plt.figure(figsize = (10,8))
# # sns.barplot(num_asgmt_lsn['Num of Assignments Graded'],num_asgmt_lsn['Word Count'], estimator=sum)
# sns.lmplot(num_asgmt_lsn['Num of Assignments Graded'],num_asgmt_lsn['Word Count'], data=num_asgmt_lsn,
#             scatter_kws={"color": "black",'s':12}, truncate = False, hue='School Year',
#             line_kws={"color": "red"}, ci=None) #, scatter=False)
# plt.xlabel('Number of Assignments')
# plt.ylabel('Word Count')
# plt.title("Number of Assignments Graded vs. Word Count", fontsize=15)
# # plt.savefig(os.path.join(path,'Word Count Bar Chart w/ Regression.png'))
# plt.show()

In [None]:
# num_asgmt_lsn.to_csv(os.path.join(path, 'num_asgmt_lsn.csv'))

In [None]:
# num_asgmt_lsn.pivot(index='School Year', columns='Num of Assignments Graded', values='Word Count') -- duplicates 

In [None]:
# ## Remove outliers
# zn = np.abs(stats.zscore(wordcnt2['Word Count']))
# print(zn)
# # threshold = 3
# print(np.where(zn > 3))
# newdf = wordcnt2
# newdf['z'] = zn

# newdf.drop(newdf[newdf['z'] >3].index, inplace=True)

In [None]:
# newnew = newdf.groupby(['School Year','Student ID','Writing Lesson Category', 'Student Assignment ID'])['Word Count'].mean()
# newnew = pd.DataFrame(newnew)
# newnew = newnew.reset_index()
# newnew

In [None]:
# wc = newnew.groupby(['School Year','Student ID','Writing Lesson Category'])['Word Count'].mean()
# na = newnew.groupby(['School Year','Student ID','Writing Lesson Category'])['Student Assignment ID'].count()

# newnewnew = pd.merge(wc,na,on=['School Year','Student ID','Writing Lesson Category'], how='inner')

# newnewnew = newnewnew.reset_index()

# newnewnew

In [None]:
# newnewnew.describe()

In [None]:
# plt.figure(figsize = (10,8))
# # sns.barplot(num_asgmt_lsn['Num of Assignments Graded'],num_asgmt_lsn['Word Count'], estimator=sum)
# sns.lmplot('Student Assignment ID','Word Count', data=newnewnew,hue='Writing Lesson Category')
# plt.xlabel('Number of Assignments')
# plt.ylabel('Word Count')
# plt.title("Number of Assignments Graded vs. Word Count", fontsize=15)
# # plt.savefig(os.path.join(path,'Word Count Bar Chart w/ Regression.png'))
# plt.show()