In [1]:
from scipy.stats import ttest_ind
from scipy.stats import ttest_rel
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.stats.multicomp import MultiComparison as multi

Part 1:

In [2]:
import pandas as pd
import numpy as np
import scipy.stats as st

dffb = pd.read_csv("customer_feedback.csv")

In [3]:
dffb.head(10)

Unnamed: 0,date,product,feedback_score
0,2023-02-22,iOS,5
1,2023-05-22,Android,2
2,2022-11-22,iOS,2
3,2022-11-26,Android,10
4,2023-04-26,iOS,1
5,2022-12-04,Android,10
6,2023-08-14,iOS,9
7,2023-03-23,Android,8
8,2023-03-05,iOS,10
9,2023-08-22,Android,5


In [4]:
dffb.shape

(500, 3)

In [5]:
dffb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   date            500 non-null    object
 1   product         500 non-null    object
 2   feedback_score  500 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 11.8+ KB


In [6]:
dfs = pd.read_csv("sales_data.csv")

In [7]:
dfs.head(10)

Unnamed: 0,date,product,sales
0,2022-12-12,iOS,473
1,2022-12-12,Android,919
2,2023-06-24,iOS,805
3,2023-06-24,Android,996
4,2023-10-20,iOS,792
5,2023-10-20,Android,971
6,2023-02-24,iOS,985
7,2023-02-24,Android,329
8,2022-11-20,iOS,878
9,2022-11-20,Android,582


In [8]:
dfs.shape

(500, 3)

In [9]:
dfs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   date     500 non-null    object
 1   product  500 non-null    object
 2   sales    500 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 11.8+ KB


In [10]:
dffb["date"]=pd.to_datetime(dffb["date"], format = "%Y-%m-%d")

In [11]:
dffb.head(10)

Unnamed: 0,date,product,feedback_score
0,2023-02-22,iOS,5
1,2023-05-22,Android,2
2,2022-11-22,iOS,2
3,2022-11-26,Android,10
4,2023-04-26,iOS,1
5,2022-12-04,Android,10
6,2023-08-14,iOS,9
7,2023-03-23,Android,8
8,2023-03-05,iOS,10
9,2023-08-22,Android,5


In [12]:
dffb.shape

(500, 3)

In [13]:
dffb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   date            500 non-null    datetime64[ns]
 1   product         500 non-null    object        
 2   feedback_score  500 non-null    int64         
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 11.8+ KB


In [14]:
dfs["date"]=pd.to_datetime(dfs["date"], format = "%Y-%m-%d")

In [15]:
dfs.head(10)

Unnamed: 0,date,product,sales
0,2022-12-12,iOS,473
1,2022-12-12,Android,919
2,2023-06-24,iOS,805
3,2023-06-24,Android,996
4,2023-10-20,iOS,792
5,2023-10-20,Android,971
6,2023-02-24,iOS,985
7,2023-02-24,Android,329
8,2022-11-20,iOS,878
9,2022-11-20,Android,582


In [16]:
dfs.shape

(500, 3)

In [17]:
dfs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   date     500 non-null    datetime64[ns]
 1   product  500 non-null    object        
 2   sales    500 non-null    int64         
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 11.8+ KB


Before getting into the analysis, I want to point out that the way the data is presented is incredibly unclear as to how it is related to each other. I want to make it clear how I am interpreting the meaning of the csv files so that my decisions can be understood in the light of my interpretation. Firstly, the sales data represents monthly sales of the same app by on two different platforms but is presented by day in pairs of two, with sometimes 4, 6, and 8 entries on a single day, with 500 entries in total. I am going to interpret this as meaning that the listed day is the day that each app releases its sales data on both platform for the previous month up until that date, and there are 250 apps included in the data. 

Secondly, the feedback data is described as "Contains customer ratings (1-10 scale) for
both iOS and Android apps." It is also in pairs of two but now the dates do not line up at all and they certainly don't line up with the dates of the sales csv file. It is unclear whether the alternating pairs are the same app being reviewed on either platform on different days, whether those apps are the same apps that line up with same index number on the sales csv file, or whether the pattern is inconsequential and they are just random apps on each app store on its own date date with no data being gathered as to which app the review is being left for. It is unclear whether each entry is an individual customer rating made on that date or a specific app's average of all the reviews they received either on that date or over the lifespan of that app. 

Since both datasheets contain such similar datastructures in presentation (250 pairs of 2, alternating iOS and Android) and since some of this analysis seems pointless if it were otherwise, I am going to assume that the apps in each index position pair (e.g. 0 and 1, or 2 and 3, or 10 and 11) are the same app not only in the feedback csv file and in the Sales csv but across files. I am going to assume that the review date shows when the aggregate rating was captured but it is generally stable across that apps lifespan. Therefore, feedback can be compared across apps and can be assigned to a sales number.

Part 2:

Based on how I am interpreting the meaning of the data, this data would be considered paired. We have the aggregate ratings of the same group of apps across two platforms, just captured on different dates. Therefore, each data point is paired by app with one rating for the iOS app store and one for the Android app store. We are just looking for the existence of any difference so we are going to use a two tailed test. 

In [18]:
# Pulling the data from all the entries in the column "feedback_score" and putting the records
# that have "iOS" in the "product" column and putting the entires in one array, and putting
# all the entries that have "Android" in the "Product" column in another array. 
iOS_feedback=np.array(dffb["feedback_score"][dffb["product"]=="iOS"])
Android_feedback=np.array(dffb["feedback_score"][dffb["product"]=="Android"])

In [19]:
print(iOS_feedback)

[ 5  2  1  9 10  1  8 10  9 10  7 10  1 10 10  4 10  7  2  1 10  8  2 10
  2  8  2  3 10  8  7  4  3  5  7  5 10  8  5  3  1  3  4  5  6  3  3 10
  6  6 10 10  7  9  1  2  7  4  4  2  3  4  6  1  9  7  6  7 10  2  2  1
  2  6  5 10  6  6 10  3  6  7  4  6  6  9  9  2  1 10  3  9  3  1  4 10
  1  9  1 10  2  7  4  8  6  9  9  2  1  6  8  9  2  8  1  7  6  2  4  8
  5  9  5  9  7  3  9  4  2  3  9 10  8  4  5  3  7  6  8  6  9  7  8  2
  4  3  6  4  9  9  5  6  3  5  8  1  7 10  9  7 10  8  3  6  3  7 10  7
  9 10  8  8  8  1  3  7  2  9 10  3  6  2  7  7  8  7  9  5 10 10  7  7
  7  1 10  6  8  6  3  1  9 10  6 10  1 10 10  2  8  8  8  4  7  8  5 10
  1  4  7  3  5  4  4  5  1  3  1  3  9  6  5  3  9  9  2  3  6  1  3  4
  2  2  2  7  9  3 10  6  7  3]


In [20]:
iOS_feedback.shape

(250,)

In [21]:
print(Android_feedback)

[ 2 10 10  8  5  1  1  6  9  6  6  5  3  7  9  8  8  5  9 10  6 10  4  4
  1  8 10  6  4  1 10  3  8  4  1 10  9  5  4  2  1  4  5  8  9  4  2  2
  4  2  8  9 10  8  8  2  8  4  7  1 10  6  8  1  3  8  6  7  9  4  5  9
 10  2  9  8  9  9  1  6  2  2  4  4  5  7  5  1  7  3 10  3  2  7 10  1
  9 10 10  8  2  2  4  8 10  1  8  1  3  2  4  1  3  2  2  8  4  4  3  7
 10  3  3  8  2  8  8  8  2  1  4  7  7 10  6  4  6  2  1  6  1  8 10  7
  4  8  5  2  8  5  9  5  3  8  4  8  7  6  2  2  4  7  1  1  9  5  8  2
  1  4  1  3  2  9  7  3  6  1  8 10  6  7 10  1  1  1 10  1  2  7  5  6
  3  6  4  8  4  5  1  2  5  5  8  2  2  8  8 10  4  4 10  9  7  4 10  1
  6  1  1  5  9  5  4  7  7  6  4  5  7 10  7  1  1  2  6  2  1  4  4  6
  7  1  9  2  5  1  5  5  5  9]


In [22]:
Android_feedback.shape

(250,)

In [23]:
# Conjoining the arrays into one two demensional array with 2 rows and 250 columns. The iOS
# array in the first column and the Android array in the second column.
feedback = np.array([iOS_feedback, Android_feedback])

In [24]:
print(feedback)

[[ 5  2  1  9 10  1  8 10  9 10  7 10  1 10 10  4 10  7  2  1 10  8  2 10
   2  8  2  3 10  8  7  4  3  5  7  5 10  8  5  3  1  3  4  5  6  3  3 10
   6  6 10 10  7  9  1  2  7  4  4  2  3  4  6  1  9  7  6  7 10  2  2  1
   2  6  5 10  6  6 10  3  6  7  4  6  6  9  9  2  1 10  3  9  3  1  4 10
   1  9  1 10  2  7  4  8  6  9  9  2  1  6  8  9  2  8  1  7  6  2  4  8
   5  9  5  9  7  3  9  4  2  3  9 10  8  4  5  3  7  6  8  6  9  7  8  2
   4  3  6  4  9  9  5  6  3  5  8  1  7 10  9  7 10  8  3  6  3  7 10  7
   9 10  8  8  8  1  3  7  2  9 10  3  6  2  7  7  8  7  9  5 10 10  7  7
   7  1 10  6  8  6  3  1  9 10  6 10  1 10 10  2  8  8  8  4  7  8  5 10
   1  4  7  3  5  4  4  5  1  3  1  3  9  6  5  3  9  9  2  3  6  1  3  4
   2  2  2  7  9  3 10  6  7  3]
 [ 2 10 10  8  5  1  1  6  9  6  6  5  3  7  9  8  8  5  9 10  6 10  4  4
   1  8 10  6  4  1 10  3  8  4  1 10  9  5  4  2  1  4  5  8  9  4  2  2
   4  2  8  9 10  8  8  2  8  4  7  1 10  6  8  1  3  8  6  7  9  4  5  9
  10 

In [25]:
feedback.shape

(2, 250)

In [26]:
# Cunducting a paired t-test
PairedT = ttest_rel(feedback[0],feedback[1])


In [27]:
print(PairedT)

TtestResult(statistic=np.float64(1.8888927560313111), pvalue=np.float64(0.06006832628158419), df=np.int64(249))


A p-value of ~.0601 is just barely above the threshold of .05 meaning we fail to reject the null hypothesis. This shows that there is just not enough of a difference samples to be certain there is a significant difference between the ratings given people of both population and that the difference seen in the samples isn't caused by statistical noise. Larger samples might lead to the noise going away, or if the the means and standard deviations stay the same then the p-value would shrink and likely cross the threshold to be able to reject the null hypothesis. As it stands we cannot determine that there is a difference in how apple and android users rate the same app across the different platforms and therefore must assume that there isn't a difference in ratings.

Part 3:

In [28]:
# Putting all sales entries that where reported before March 1st, 2023 into one array and
# all that were reported after into a second array. As I am assuming that the date in sales
# represents the previous months sales, I think its only appropriate to exclude the month of
# data where the sales campaign launched in the middle of the data collection. Thus, any sales
# reported in March of 2023 will be excluded.
pre_campaign=np.array(dfs["sales"][dfs["date"] < "2023-03-01"])
post_campaign=np.array(dfs["sales"][dfs["date"] >= "2023-04-01"])

In [29]:
print(pre_campaign)

[473 919 985 329 878 582 449 416 952 133 844 704 357 385 262 265 523 103
 885 278 142 777 955 975 230 965 679 496 687 172 112 767 281 904 881 368
 564 546 914 216 886 911 564 882 922 330 165 832 514 152 808 118 425 368
 561 821 339 398 519 755 956 961 629 681 462 576 543 293 261 654 884 433
 432 961 125 800 891 704 540 579 157 269 180 329 334 615 687 998 947 892
 902 509 810 760 941 120 898 814 259 145 222 102 216 453 916 119 806 803
 565 433 206 657 719 667 836 413 345 404 330 644 392 119 794 808 740 174
 932 990 548 459 713 828 621 995 306 918 493 377 254 817 979 108 214 918
 247 373 816 913]


In [30]:
pre_campaign.shape

(148,)

In [31]:
print(post_campaign)

[805 996 792 971 506 464 989 807 999 384 967 480 530 551 885 459 185 439
 271 837 624 718 409 177 737 667 780 689 345 814 108 981 626 918 270 289
 833 376 636 547 729 705 618 829 244 460 378 549 888 174 813 458 291 295
 580 330 103 621 394 357 691 105 883 961 713 763 838 404 710 917 735 248
 732 147 660 447 835 817 587 705 847 212 142 896 162 765 906 719 784 477
 643 674 899 309 535 134 333 588 813 296 843 625 570 304 349 820 761 719
 755 599 498 520 606 931 875 618 505 773 288 955 784 393 444 778 686 301
 148 575 682 295 805 104 938 409 342 860 661 435 398 293 313 608 946 957
 502 482 205 515 383 264 370 338 894 820 791 515 613 567 467 453 808 647
 893 554 305 686 484 965 737 520 345 339 440 649 973 608 915 690 656 149
 217 670 384 612 262 815 433 188 630 782 443 710 446 436 455 697 526 406
 202 788 838 893 189 784 394 711 917 895 283 120 435 200 477 690 426 781
 465 585 819 298 182 996 990 849 342 536 928 217 244 637 447 251 380 788
 182 881 187 671 752 665 164 626 803 534 851 131 82

In [32]:
post_campaign.shape

(312,)

In [33]:
148 + 312

460

Despite the data being before-and-after, there is nothing tying any of the specific datapoints from before the campaign to after the campaign, like sails on a specific day of the year, or per app before and after, so it is unpaired and independent. We are still just looking for if the campaign had an impact so we will use two-tailed.

In [34]:
# Testing if we need to assume equal or unequal variance based on if the standard deviation
# of the larger array is more than two times greater than the standard deviation of the smaller
EqualVar = True
if pre_campaign.std() > post_campaign.std():
    if (pre_campaign.std() / post_campaign.std()) > 2:
        EqualVar = False
else:
    if (post_campaign.std() / pre_campaign.std()) > 2:
        EqualVar = False

In [35]:
# Intializing the independent t test and printing results
salesT = ttest_ind(pre_campaign, post_campaign, equal_var=EqualVar)
print (salesT)

TtestResult(statistic=np.float64(0.17298529520949715), pvalue=np.float64(0.862739406552252), df=np.float64(458.0))


A p-value of ~0.8627 does not reach the .05 threshold and therefore we fail to reject the null hypothesis. This shows that the campaign almost certainly did not have a significant effect on sales. 

Part 4:

In [36]:
# We are checking if summer months have higher sales than winter months, so we need to create
# a new column that just extracts the month out of the date incase a month is reported across
# more than one year.
dfs["month"] = dfs["date"].dt.month

In [37]:
dfs.head(10)

Unnamed: 0,date,product,sales,month
0,2022-12-12,iOS,473,12
1,2022-12-12,Android,919,12
2,2023-06-24,iOS,805,6
3,2023-06-24,Android,996,6
4,2023-10-20,iOS,792,10
5,2023-10-20,Android,971,10
6,2023-02-24,iOS,985,2
7,2023-02-24,Android,329,2
8,2022-11-20,iOS,878,11
9,2022-11-20,Android,582,11


In [38]:
dfs.shape

(500, 4)

In [39]:
# Putting all the sales data for months 1, 2, and 12 in the winter array and all the sales 
# data from months 6, 7, and 8 into the summer array
winter=np.array(dfs["sales"][(dfs["month"] == 12) | (dfs["month"] < 3)])
summer=np.array(dfs["sales"][(dfs["month"] < 9) & (dfs["month"] > 5)])

In [40]:
print(winter)

[473 919 985 329 449 416 952 133 844 704 357 385 262 265 523 103 885 278
 142 777 955 975 679 496 687 172 112 767 281 904 881 368 564 546 886 911
 564 882 922 330 165 832 514 152 808 118 425 368 561 821 519 755 956 961
 629 681 462 576 543 293 261 654 884 433 432 961 891 704 540 579 157 269
 180 329 334 615 687 998 947 892 902 509 810 760 941 120 898 814 259 145
 222 102 216 453 916 119 806 803 206 657 719 667 836 413 345 404 330 644
 794 808 740 174 548 459 713 828 621 995 306 918 493 377 254 817 979 108
 214 918 247 373 816 913]


In [41]:
print(summer)

[805 996 999 384 530 551 885 459 624 718 780 689 108 981 636 547 618 829
 378 549 888 174 813 458 291 295 394 357 713 763 732 147 835 817 587 705
 142 896 643 674 535 134 813 296 843 625 570 304 349 820 761 719 606 931
 505 773 686 301 682 295 805 104 502 482 370 338 791 515 915 690 656 149
 217 670 630 782 443 710 526 406 189 784 917 895 283 120 182 996 990 849
 342 536 244 637 380 788 182 881 752 665 164 626 271 521 760 308 277 444
 432 822 308 888 579 387 875 614 474 625 664 861 161 562 999 437 607 446]


In [42]:
winter.shape

(132,)

In [43]:
summer.shape

(126,)

This test is no difference from part three in terms of set up as the same data is being evaluated just with a different demarkation of which records to be included and compared against each other. Thus it is still unpaired and independent. We are still just looking for any difference so it is still two tailed

In [44]:
# Settting up the equal variance assumption test again
EqualVar = True
if winter.std() > summer.std():
    if (winter.std() / summer.std()) > 2:
        EqualVar = False
else:
    if (summer.std() / winter.std()) > 2:
        EqualVar = False

In [45]:
# Running the independent t test. 
seasonsT = ttest_ind(winter, summer, equal_var=EqualVar)
print (seasonsT)

TtestResult(statistic=np.float64(-0.09927308556714513), pvalue=np.float64(0.9209991394112975), df=np.float64(256.0))


With a p-value of ~.9210 the score does not meet the threshold of .05 and thus we fail to reject the null hypthesis. Therefore we find no significant difference in the sales levels in summer vs winter.

Part 5:

In [46]:
# Creating the month columns for feedback_score dataframe as we did with the sales dataframe
dffb["month"] = dffb["date"].dt.month

In [47]:
dffb.head(10)

Unnamed: 0,date,product,feedback_score,month
0,2023-02-22,iOS,5,2
1,2023-05-22,Android,2,5
2,2022-11-22,iOS,2,11
3,2022-11-26,Android,10,11
4,2023-04-26,iOS,1,4
5,2022-12-04,Android,10,12
6,2023-08-14,iOS,9,8
7,2023-03-23,Android,8,3
8,2023-03-05,iOS,10,3
9,2023-08-22,Android,5,8


In [48]:
dffb.shape

(500, 4)

In [49]:
dffb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   date            500 non-null    datetime64[ns]
 1   product         500 non-null    object        
 2   feedback_score  500 non-null    int64         
 3   month           500 non-null    int32         
dtypes: datetime64[ns](1), int32(1), int64(1), object(1)
memory usage: 13.8+ KB


In [50]:
# Renaming all the month entries from numbers to their names so that it will be easier to 
# interperate the analysis.
dffb["month"].replace(
    {1:"January", 2:"February", 3:"March", 4:"February",
     5:"May", 6:"June", 7:"July", 8:"August", 9:"September",
     10:"October", 11:"November", 12:"December"}, inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dffb["month"].replace(


In [51]:
dffb.head(20)

Unnamed: 0,date,product,feedback_score,month
0,2023-02-22,iOS,5,February
1,2023-05-22,Android,2,May
2,2022-11-22,iOS,2,November
3,2022-11-26,Android,10,November
4,2023-04-26,iOS,1,February
5,2022-12-04,Android,10,December
6,2023-08-14,iOS,9,August
7,2023-03-23,Android,8,March
8,2023-03-05,iOS,10,March
9,2023-08-22,Android,5,August


In [52]:
dffb.tail(20)

Unnamed: 0,date,product,feedback_score,month
480,2023-06-13,iOS,2,June
481,2023-05-09,Android,7,May
482,2023-06-21,iOS,2,June
483,2023-08-03,Android,1,August
484,2022-12-17,iOS,2,December
485,2023-04-19,Android,9,February
486,2023-01-04,iOS,7,January
487,2023-08-03,Android,2,August
488,2023-08-02,iOS,9,August
489,2022-12-01,Android,5,December


In [53]:
dffb.shape

(500, 4)

In [54]:
# Creating a dataframe with only the desired months to compare against each other. 
rdffb = st.f_oneway(
    dffb["feedback_score"][dffb["month"] == "January"], 
    dffb["feedback_score"][dffb["month"] == "May"],
    dffb["feedback_score"][dffb["month"] == "September"],
    dffb["feedback_score"][dffb["month"] == "December"]
)

In [55]:
print("ANOVA Results: ", rdffb)

ANOVA Results:  F_onewayResult(statistic=np.float64(0.3146823675455494), pvalue=np.float64(0.8147473590881886))


In [56]:
# For the posthoc analysis, it will read all months, so to get it to only compare the months
# that we want, we will create a separate dataframe that excludes all entries from undesired 
# months.
important_months = ["January", "May", "September", "December"]

In [57]:
dffb_filtered = dffb[dffb["month"].isin(important_months)]

In [58]:
dffb_filtered.shape

(157, 4)

In [59]:
mc = multi(dffb_filtered["feedback_score"], dffb_filtered["month"])
posthoc = mc.tukeyhsd()

In [60]:
print(posthoc)

  Multiple Comparison of Means - Tukey HSD, FWER=0.05   
 group1    group2  meandiff p-adj   lower  upper  reject
--------------------------------------------------------
December   January   0.2081  0.991 -1.6253 2.0416  False
December       May  -0.1019 0.9989 -1.8952 1.6915  False
December September  -0.4239 0.9152 -2.1165 1.2687  False
 January       May    -0.31 0.9726 -2.1664 1.5464  False
 January September   -0.632 0.7871 -2.3913 1.1273  False
     May September   -0.322 0.9619 -2.0394 1.3954  False
--------------------------------------------------------


All the months we are looking at both between each other and in general have a p-value of greater than .05 so we fail to reget the null hypothesis. This signifies that from our data there doesn't appear to be any difference in how people score their products in January, May, September, and December. People don't appear to have greater or less appreciation for their product depending on the time of year they buy it, at least for the months we checked. This helps to justify my assumption that reviews are relatively stable for each app.

Part 6:

In [61]:
# Removing the month column to return the dataframes to how they were previously.
dffb=dffb.drop("month", axis=1)


In [62]:
dffb.head(20)

Unnamed: 0,date,product,feedback_score
0,2023-02-22,iOS,5
1,2023-05-22,Android,2
2,2022-11-22,iOS,2
3,2022-11-26,Android,10
4,2023-04-26,iOS,1
5,2022-12-04,Android,10
6,2023-08-14,iOS,9
7,2023-03-23,Android,8
8,2023-03-05,iOS,10
9,2023-08-22,Android,5


In [63]:
dffb.shape

(500, 3)

In [64]:
dfs=dfs.drop("month", axis=1)

In [65]:
dfs.shape

(500, 3)

In [66]:
dfs.head(20)

Unnamed: 0,date,product,sales
0,2022-12-12,iOS,473
1,2022-12-12,Android,919
2,2023-06-24,iOS,805
3,2023-06-24,Android,996
4,2023-10-20,iOS,792
5,2023-10-20,Android,971
6,2023-02-24,iOS,985
7,2023-02-24,Android,329
8,2022-11-20,iOS,878
9,2022-11-20,Android,582


In [67]:
# Since I am assuming that the apps line up across csv files, I will merge the datafiles
# only on the basis of their index location and not on any other columns like date or product.
df_merged = pd.merge(dffb, dfs, left_index=True, right_index=True, suffixes=('_feedback', '_sales'))

In [68]:
df_merged.head(20)

Unnamed: 0,date_feedback,product_feedback,feedback_score,date_sales,product_sales,sales
0,2023-02-22,iOS,5,2022-12-12,iOS,473
1,2023-05-22,Android,2,2022-12-12,Android,919
2,2022-11-22,iOS,2,2023-06-24,iOS,805
3,2022-11-26,Android,10,2023-06-24,Android,996
4,2023-04-26,iOS,1,2023-10-20,iOS,792
5,2022-12-04,Android,10,2023-10-20,Android,971
6,2023-08-14,iOS,9,2023-02-24,iOS,985
7,2023-03-23,Android,8,2023-02-24,Android,329
8,2023-03-05,iOS,10,2022-11-20,iOS,878
9,2023-08-22,Android,5,2022-11-20,Android,582


In [69]:
df_merged.shape

(500, 6)

In [70]:
# Double checking that all products line up across both csv files. We will remove any rows where the products
# don't line up. Since there should be none of these then we should see an identical shape to the one taken just 
# above. 
df_merged = df_merged[df_merged['product_feedback'] == df_merged['product_sales']]

In [71]:
df_merged.head(20)

Unnamed: 0,date_feedback,product_feedback,feedback_score,date_sales,product_sales,sales
0,2023-02-22,iOS,5,2022-12-12,iOS,473
1,2023-05-22,Android,2,2022-12-12,Android,919
2,2022-11-22,iOS,2,2023-06-24,iOS,805
3,2022-11-26,Android,10,2023-06-24,Android,996
4,2023-04-26,iOS,1,2023-10-20,iOS,792
5,2022-12-04,Android,10,2023-10-20,Android,971
6,2023-08-14,iOS,9,2023-02-24,iOS,985
7,2023-03-23,Android,8,2023-02-24,Android,329
8,2023-03-05,iOS,10,2022-11-20,iOS,878
9,2023-08-22,Android,5,2022-11-20,Android,582


In [72]:
df_merged.shape

(500, 6)

In [73]:
# Now we will drop one of the product columns it is redundant, and rename the other back to just "product."
df_merged=df_merged.drop("product_sales", axis=1)
df_merged.rename(columns={"product_feedback": "product"}, inplace=True)

In [74]:
df_merged.head(20)

Unnamed: 0,date_feedback,product,feedback_score,date_sales,sales
0,2023-02-22,iOS,5,2022-12-12,473
1,2023-05-22,Android,2,2022-12-12,919
2,2022-11-22,iOS,2,2023-06-24,805
3,2022-11-26,Android,10,2023-06-24,996
4,2023-04-26,iOS,1,2023-10-20,792
5,2022-12-04,Android,10,2023-10-20,971
6,2023-08-14,iOS,9,2023-02-24,985
7,2023-03-23,Android,8,2023-02-24,329
8,2023-03-05,iOS,10,2022-11-20,878
9,2023-08-22,Android,5,2022-11-20,582


In [75]:
df_merged.shape

(500, 5)

In [76]:
df_merged["score"] = ["high" if score > 5 else "low" for score in df_merged["feedback_score"]]

In [77]:
df_merged.head(20)

Unnamed: 0,date_feedback,product,feedback_score,date_sales,sales,score
0,2023-02-22,iOS,5,2022-12-12,473,low
1,2023-05-22,Android,2,2022-12-12,919,low
2,2022-11-22,iOS,2,2023-06-24,805,low
3,2022-11-26,Android,10,2023-06-24,996,high
4,2023-04-26,iOS,1,2023-10-20,792,low
5,2022-12-04,Android,10,2023-10-20,971,high
6,2023-08-14,iOS,9,2023-02-24,985,high
7,2023-03-23,Android,8,2023-02-24,329,high
8,2023-03-05,iOS,10,2022-11-20,878,high
9,2023-08-22,Android,5,2022-11-20,582,low


In [78]:
df_merged.shape

(500, 6)

In [79]:
# We will put the high rating sales into one array and the low rating into another
high_rating=np.array(df_merged["sales"][df_merged["score"]=="high"])
low_rating=np.array(df_merged["sales"][df_merged["score"]=="low"])

In [80]:
print(high_rating)

[996 971 985 329 878 989 449 416 952 133 844 704 999 384 357 967 480 530
 551 103 885 459 185 278 837 624 718 249 913 737 142 777 975 814 108 626
 315 247 289 679 376 636 547 687 174 813 458 112 491 103 281 904 394 357
 881 368 691 105 961 838 404 546 745 248 732 147 835 587 705 847 212 142
 896 162 765 477 674 530 911 899 309 564 882 922 330 535 588 813 165 570
 514 586 697 349 118 761 599 425 520 398 606 755 956 961 618 505 773 629
 784 393 444 778 543 686 301 261 884 432 805 938 409 342 398 293 608 946
 205 515 540 269 180 329 264 894 791 515 613 567 453 647 822 263 893 305
 484 965 737 345 339 984 100 892 509 973 810 760 941 814 915 105 217 612
 262 815 849 647 433 418 630 259 145 443 436 335 216 453 455 526 202 838
 189 394 895 120 435 690 916 426 781 585 819 298 996 990 849 342 928 244
 447 251 565 206 657 380 719 667 182 671 836 752 665 164 803 820 330 215
 506 700 392 119 352 401 521 794 528 760 308 852 277 444 139 160 744 257
 740 459 995 822 935 579 387 875 614 876 104 474 86

In [81]:
high_rating.shape

(257,)

In [82]:
print(low_rating)

[473 919 805 792 582 506 464 807 385 262 265 523 439 885 271 409 177 667
 780 689 955 345 981 918 230 965 270 958 133 496 833 172 999 963 729 705
 618 829 244 460 378 549 888 291 295 580 330 767 203 621 883 713 763 710
 917 564 914 216 916 735 660 447 817 906 719 572 304 784 643 427 886 134
 333 296 832 843 625 304 152 820 592 210 808 719 755 368 561 821 498 339
 931 519 875 288 955 681 462 576 293 719 553 148 575 654 433 961 682 295
 104 125 800 860 891 704 661 435 313 957 502 482 579 157 383 370 338 334
 615 820 467 808 687 998 554 686 520 947 440 649 902 608 937 320 120 898
 690 656 149 434 670 384 188 499 782 222 102 710 446 634 697 406 788 893
 784 711 917 283 200 477 119 465 182 536 217 637 806 803 433 788 881 187
 413 626 534 851 131 345 404 828 644 257 366 680 271 808 898 331 458 153
 907 356 123 113 174 932 990 548 222 932 713 828 621 432 756 306 918 308
 888 643 903 535 106 625 664 893 659 607 161 562 999 437 250 902 493 377
 254 108 282 984 642 918 446 373 816]


In [83]:
low_rating.shape

(243,)

Testing the monthly sales for each app per store based on its rating will still be independent and unpaired as there is nothing connecting the high-rating sales' numbers and the low-rating sales' numbers. We are still just looking for a difference so it will again be two-tailed.

In [84]:
# Setting up our equal variation test.
EqualVar = True
if high_rating.std() > low_rating.std():
    if (high_rating.std() / low_rating.std()) > 2:
        EqualVar = False
else:
    if (low_rating.std() / high_rating.std()) > 2:
        EqualVar = False

In [85]:
# Performing test and printing results
ratingsT = ttest_ind(high_rating, low_rating, equal_var=EqualVar)
print (ratingsT)

TtestResult(statistic=np.float64(-0.42984433743953354), pvalue=np.float64(0.6674948065771038), df=np.float64(498.0))


In [86]:
# We will test the correlation with monthly sales and the numeric feedback_score.
# To take correlation we will put the sales columns and the feedback_scores columns into two arrays
sales_array=np.array(df_merged["sales"])
feedback_score_array=np.array(df_merged["feedback_score"])

In [87]:
print(sales_array)

[473 919 805 996 792 971 985 329 878 582 506 464 989 807 449 416 952 133
 844 704 999 384 357 385 262 265 967 480 530 551 523 103 885 459 185 439
 885 278 271 837 624 718 249 913 409 177 737 667 780 689 142 777 955 975
 345 814 108 981 626 918 315 247 230 965 270 289 958 133 679 496 833 376
 636 547 687 172 999 963 729 705 618 829 244 460 378 549 888 174 813 458
 291 295 580 330 112 767 491 203 103 621 281 904 394 357 881 368 691 105
 883 961 713 763 838 404 710 917 564 546 914 216 916 745 735 248 732 147
 660 447 835 817 587 705 847 212 142 896 162 765 906 719 572 304 784 477
 643 674 530 427 886 911 899 309 564 882 922 330 535 134 333 588 813 296
 165 832 843 625 570 304 514 152 586 697 349 820 592 210 808 118 761 719
 755 599 425 368 561 821 498 520 339 398 606 931 519 755 956 961 875 618
 505 773 288 955 629 681 462 576 784 393 444 778 543 293 686 301 719 553
 148 575 261 654 884 433 432 961 682 295 805 104 125 800 938 409 342 860
 891 704 661 435 398 293 313 608 946 957 502 482 20

In [88]:
sales_array.shape

(500,)

In [89]:
print(feedback_score_array)

[ 5  2  2 10  1 10  9  8 10  5  1  1  8  1 10  6  9  9 10  6  7  6 10  5
  1  3 10  7 10  9  4  8 10  8  7  5  2  9  1 10 10  6  8 10  2  4 10  4
  2  1  8  8  2 10  3  6 10  4  8  1  7 10  4  3  3  8  5  4  7  1  5 10
 10  9  8  5  5  4  3  2  1  1  3  4  4  5  5  8  6  9  3  4  3  2 10  2
  6  4  6  2 10  8 10  9  7 10  9  8  1  8  2  2  7  8  4  4  4  7  2  1
  3 10  4  6  6  8  1  1  9  3  7  8  6  6  7  7 10  9  2  4  2  5  1  9
  2 10  6  2  5  9 10  8  6  9  6  9 10  1  3  6  6  2  7  2  4  4  6  4
  6  5  9  7  9  5  2  1  1  7 10  3  3 10  9  3  3  2  1  7  4 10 10  1
  1  9  9 10  1 10 10  8  2  2  7  2  4  4  8  8  6 10  9  1  9  8  2  1
  1  3  6  2  8  4  9  1  2  3  8  2  1  2  7  8  6  4  2  4  4  3  8  7
  5 10  9  3  5  3  9  8  7  2  3  8  9  8  4  8  2  2  3  1  9  4 10  7
  8  7  4 10  5  6  3  4  7  6  6  2  8  1  6  6  9  1  7  8  8 10  2  7
  4  4  3  8  6  5  4  2  9  8  9  5  5  9  6  5  3  3  5  8  8  4  1  8
  7  7 10  6  9  2  7  2 10  4  8  7  3  1  6  1  3

In [90]:
feedback_score_array.shape

(500,)

In [91]:
# Finding the correlation between the sales and the feedback scores
np.corrcoef(sales_array, feedback_score_array)

array([[1.        , 0.00814363],
       [0.00814363, 1.        ]])

A high p-value of ~.6675 is supported by the low correlation of .0081. There appears to be next to no correlation between ratings and monthly sales in our data and our p-value makes us assume that having a rating higher than 5 does not make you any more likely to have higher monthly sales than having a rating of 5 or less. Our p-value does not meet the threshold of .05 and thus we have failed to reject the null hypothesis. As a reflection on the entire analysis we have done, we could not find any significant difference in our samples in any metrics. The only insight we have is that the p-value between the feedback scores of same apps on the iOS app store vs the Android app store was low enough that suggest that more research on the subject, hopefully with a study with a larger sample size, may be worthwhile. 